248 lines
8.5 KiB
Python
248 lines
8.5 KiB
Python
import sys
|
|
import os
|
|
import json
|
|
import shutil
|
|
import argparse
|
|
import concurrent.futures
|
|
from PIL import Image, ImageDraw
|
|
from reportlab.pdfgen import canvas
|
|
|
|
import annotating
|
|
import annotating_with_checks
|
|
|
|
from utils import natural_key
|
|
|
|
# Soft cap, in pixels, on the total height of one concatenated group image.
# main() splits the rendered items once with this cap and once with
# 1.1 * MAX_HEIGHT_PX, and keeps the relaxed split when it needs fewer groups.
# The cap is "soft" because a group is only closed at a student boundary.
MAX_HEIGHT_PX = 25000 # Can be increased by 10%.
|
|
|
|
def render_item(item):
    """Render the annotated image for a single (student, label) work item.

    Args:
        item: A ``(student_id, label, content)`` triple. ``content`` is a
            mapping read for the keys ``'pdf_path'``, ``'result'`` and
            ``'coordinates'`` (only the first ``'coordinates'`` entry is used).

    Returns:
        ``(student_id, label, image, header_h, checkboxes)`` where ``image``
        and ``header_h`` come from ``annotating.compose_label_image`` and
        ``checkboxes`` is read from the checkbox renderer afterwards.
        ``None`` when the PDF does not exist on disk or when no image was
        composed.
    """
    sid, lbl, payload = item

    source_pdf = payload['pdf_path']
    if not os.path.exists(source_pdf):
        print("no pdf path for ", source_pdf)
        return None

    page_img, _, _ = annotating.make_base_image(source_pdf)
    checkbox_renderer = annotating_with_checks.CheckboxRenderer(lbl)

    # The renderer's callback is invoked by compose_label_image; whatever it
    # collects is exposed afterwards through `checkbox_renderer.checkboxes`.
    composed = annotating.compose_label_image(
        page_img,
        lbl,
        payload['result'],
        payload['coordinates'][0],
        draw_callback=checkbox_renderer.callback,
        more_right=True,
        with_id=sid,
    )
    annotated_img, header_h = composed

    if annotated_img is not None:
        return (sid, lbl, annotated_img, header_h, checkbox_renderer.checkboxes)
    return None
|
|
|
|
def save_batch(batch, prefix, group_id, root_dir, overwrite):
    """Write one group of rendered items to ``<root_dir>/BGnot/<prefix> G<group_id>``.

    The images of the batch are stacked vertically on a white canvas and the
    following files are produced in the output directory:

    * ``Reference.jpg`` - the stacked image (JPEG, quality 90);
    * ``Concat.pdf``    - a single-page PDF whose page size equals the image
      size and which contains ``Reference.jpg``;
    * ``bnote.json``    - canvas width/height plus, per image, its student id,
      label, header height and vertical extent (``hmin``/``hmax``);
    * ``checkboxes.json`` - every checkbox of the batch, with a ``global_box``
      expressed in stacked-image coordinates and the owning ``student_id``.

    Args:
        batch: Sequence of ``(student_id, label, image, header_h, checkboxes)``
            tuples as returned by ``render_item``. ``checkboxes`` is an
            iterable of dicts carrying a ``'final_box'`` or ``'rel_box'``
            entry indexed as ``[x0, y0, x1, y1]``.
        prefix: Name prefix of the output directory.
        group_id: Group number appended to the directory name.
        root_dir: Directory under which ``BGnot`` lives.
        overwrite: When true an existing output directory is deleted and
            regenerated; when false an existing directory is left untouched.

    Returns:
        None. Nothing is written when the batch is empty or when the output
        already exists and ``overwrite`` is false.

    Note:
        The checkbox dicts are updated in place (``global_box`` and
        ``student_id`` keys are added), as in the previous implementation.
    """
    if not batch:
        # max() below would fail on an empty batch, and it used to do so only
        # after the old output had been deleted and an empty directory created.
        print(f"Skipping {prefix} G{group_id}: empty batch.")
        return

    output_dir = os.path.join(root_dir, "BGnot", f"{prefix} G{group_id}")

    if os.path.exists(output_dir):
        if not overwrite:
            print(f"Skipping {output_dir}: Output already exists.")
            return
        shutil.rmtree(output_dir)

    print(f"Generating Group PDF: {prefix} G{group_id} ({len(batch)} elements)")
    os.makedirs(output_dir)

    max_w = max(entry[2].width for entry in batch)
    total_h = sum(entry[2].height for entry in batch)
    concat_img = Image.new("RGB", (max_w, total_h), "white")
    draw = ImageDraw.Draw(concat_img)

    final_json_map = []
    bnote_entries = []
    current_y = 0
    last_sid = None

    for sid, label, img, header_h, boxes in batch:
        concat_img.paste(img, (0, current_y))

        # A purple band across the top of the first image of each student
        # marks where a new student starts.
        if sid != last_sid:
            draw.rectangle([0, current_y, max_w, current_y + 4], fill="purple")
            last_sid = sid

        bnote_entries.append({
            "id": sid,
            "label": label,
            "header_height": header_h,
            "hmin": current_y,
            "hmax": current_y + img.height,
        })

        for box in boxes:
            coords = box.get('final_box') or box.get('rel_box')
            if not coords:
                # Without coordinates the box cannot be placed; skip it
                # instead of aborting with half-written output.
                print(f"Warning: checkbox without coordinates for {sid} / {label}; skipped.")
                continue
            # Shift the box from image-local to stacked-image coordinates.
            box['global_box'] = [
                coords[0],
                coords[1] + current_y,
                coords[2],
                coords[3] + current_y,
            ]
            box['student_id'] = sid  # Required to map checkbox to the correct student
            final_json_map.append(box)

        current_y += img.height

    with open(os.path.join(output_dir, "bnote.json"), "w", encoding="utf-8") as f:
        json.dump({"width": max_w, "height": total_h, "images": bnote_entries}, f, indent=2)

    with open(os.path.join(output_dir, "checkboxes.json"), "w", encoding="utf-8") as f:
        json.dump(final_json_map, f, indent=2)

    # The JPEG is kept as a deliverable and is also the image embedded in the PDF.
    reference_path = os.path.join(output_dir, "Reference.jpg")
    concat_img.save(reference_path, quality=90)

    pdf_path = os.path.join(output_dir, "Concat.pdf")
    w, h = concat_img.size
    c = canvas.Canvas(pdf_path, pagesize=(w, h))
    c.drawImage(reference_path, 0, 0, width=w, height=h)
    c.save()
|
|
|
|
def _read_stripped_lines(path):
    """Return the non-blank lines of *path*, stripped of surrounding whitespace."""
    with open(path, "r") as f:
        return [line.strip() for line in f if line.strip()]


def _build_label_groups(label_groups, all_labels):
    """Create the *label_groups* file from the *all_labels* file.

    Labels sharing the text before the first ``' : '`` are written on one
    comma-separated line. Exits with status 1 when *all_labels* is missing.
    """
    print(f"Warning: Labels file '{label_groups}' not found, making it out of '{all_labels}'")
    if not os.path.exists(all_labels):
        print(f"Error: {all_labels} not found.")
        sys.exit(1)

    groups = {}
    for line in _read_stripped_lines(all_labels):
        # Key is the part before the colon, or the whole line if no colon.
        groups.setdefault(line.split(' : ')[0], []).append(line)

    with open(label_groups, 'w') as f:
        f.writelines(",".join(items) + "\n" for items in groups.values())


def _pick_unique_prefix(safe_labels, used_prefixes):
    """Derive an output-directory prefix for a group that is not in *used_prefixes*.

    The prefix is the common leading text of the labels ("Group" when there is
    none). On a clash with an earlier group, "-2", "-3", ... is appended.
    """
    base_prefix = os.path.commonprefix(safe_labels).strip() or "Group"

    # A trailing "i" is dropped from the first candidate only; the numbered
    # fallbacks below are built from the untrimmed prefix.
    # NOTE(review): presumably this trims a roman-numeral style suffix - confirm.
    unique_prefix = base_prefix[:-1] if base_prefix.endswith("i") else base_prefix

    counter = 2
    while unique_prefix in used_prefixes:
        unique_prefix = f"{base_prefix}-{counter}"
        counter += 1

    # NOTE(review): the previous revision also had a branch that renamed a
    # group to "<previous prefix>-2" when the preceding group's prefix was
    # contained in this one. It could never run, because the variable meant to
    # remember the preceding prefix was assigned under a misspelled name
    # (`previous_prefx`). It is deliberately not revived here: enabling it
    # would rename existing output directories and can hand the same prefix to
    # two groups. Confirm the intended naming before reintroducing it.
    return unique_prefix


def _scan_existing_output(bgnot_dir, unique_prefix):
    """Inspect earlier output for *unique_prefix* inside *bgnot_dir*.

    Returns:
        ``(max_group, done)`` where ``max_group`` is the highest group number
        found among directories named ``"<unique_prefix> G<n>"`` (0 if none)
        and ``done`` is the set of ``(id, label)`` pairs listed in their
        ``bnote.json`` files.
    """
    max_group = 0
    done = set()
    marker = f"{unique_prefix} G"
    for entry in os.listdir(bgnot_dir):
        if not entry.startswith(marker):
            continue
        try:
            max_group = max(max_group, int(entry.split(' G')[-1]))
        except ValueError:
            # Directory name does not end in a number: it still may hold a
            # bnote.json, but it does not take part in the numbering.
            pass

        bnote_path = os.path.join(bgnot_dir, entry, "bnote.json")
        if os.path.exists(bnote_path):
            with open(bnote_path, "r", encoding="utf-8") as bf:
                bdata = json.load(bf)
            for img in bdata.get("images", []):
                done.add((img["id"], img["label"]))
    return max_group, done


def _split_into_batches(rendered, max_height):
    """Greedily pack rendered items into batches of at most *max_height* pixels.

    A batch is only closed when the next item belongs to a different student
    than the previous one, so a batch can exceed *max_height* while a student's
    items are kept together.
    """
    batches = []
    current = []
    current_h = 0
    last_sid = None
    for entry in rendered:
        sid = entry[0]
        img_h = entry[2].height
        # Split if we exceed max height AND we are on a new student.
        if current and current_h + img_h > max_height and sid != last_sid:
            batches.append(current)
            current = []
            current_h = 0
        current.append(entry)
        current_h += img_h
        last_sid = sid
    if current:
        batches.append(current)
    return batches


def main():
    """Command-line entry point: generate grouped, annotated PDFs.

    Steps:
        1. Load the per-student results with ``annotating.make_dictionary``
           (used here as a mapping ``student id -> {label: content}``).
        2. Read ``<input_path>/label_groups`` (one comma-separated group of
           labels per line), creating it from ``<input_path>/labels`` when it
           does not exist.
        3. For every group: pick an output prefix, render each
           (student, label) item that is not already present in earlier
           output, split the images into height-bounded batches and write
           each batch with ``save_batch``.

    With ``--overwrite`` the whole ``BGnot`` directory is deleted first.
    Without it, items already listed in an existing ``bnote.json`` are skipped
    and new groups are numbered after the highest existing group.

    Exits with status 1 when neither ``label_groups`` nor ``labels`` exists.
    """
    parser = argparse.ArgumentParser(description="Generate annotated PDFs grouped by labels.")
    parser.add_argument("input_path", help="Directory containing Bnot structure")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing output files")
    args = parser.parse_args()

    root_dir = args.input_path
    results = annotating.make_dictionary(root_dir)
    label_groups = os.path.join(root_dir, "label_groups")
    all_labels = os.path.join(root_dir, "labels")

    if not os.path.exists(label_groups):
        _build_label_groups(label_groups, all_labels)

    lines = _read_stripped_lines(label_groups)

    bgnot_dir = os.path.join(root_dir, "BGnot")
    if args.overwrite and os.path.exists(bgnot_dir):
        shutil.rmtree(bgnot_dir)
    os.makedirs(bgnot_dir, exist_ok=True)

    used_prefixes = set()

    for line in lines:
        labels = [part.strip() for part in line.split(',') if part.strip()]
        if not labels:
            continue
        # Colons are removed before computing the directory prefix.
        safe_labels = [label.replace(":", "").strip() for label in labels]

        unique_prefix = _pick_unique_prefix(safe_labels, used_prefixes)
        used_prefixes.add(unique_prefix)

        existing_items = set()
        max_existing_group = 0
        if not args.overwrite and os.path.exists(bgnot_dir):
            max_existing_group, existing_items = _scan_existing_output(bgnot_dir, unique_prefix)

        # Only keep items that have not been generated yet.
        items_to_render = [
            (sid, lbl, lbls[lbl])
            for sid, lbls in results.items()
            for lbl in labels
            if lbl in lbls and (sid, lbl) not in existing_items
        ]
        if not items_to_render:
            continue

        # Sort structurally: by student id and label.
        items_to_render.sort(key=lambda x: (natural_key(x[0]), natural_key(x[1])))

        # Render with two worker threads; executor.map keeps the sorted order.
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            rendered = list(executor.map(render_item, items_to_render))

        rendered = [r for r in rendered if r is not None]
        if not rendered:
            continue

        # Prefer the strict height limit; accept the 10% relaxed one only when
        # it reduces the number of groups.
        batches = _split_into_batches(rendered, MAX_HEIGHT_PX)
        relaxed = _split_into_batches(rendered, 1.1 * MAX_HEIGHT_PX)
        if len(relaxed) < len(batches):
            batches = relaxed

        for i, batch in enumerate(batches, 1):
            save_batch(batch, unique_prefix, max_existing_group + i, root_dir, args.overwrite)
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|