From 4584993106f53aedcf5f13fbe12a81db6b2af7b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Miquel?= Date: Tue, 10 Mar 2026 15:23:49 +0100 Subject: [PATCH] Better group sizes (bigger, slightly more homogeneous) --- annotating_by_label.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/annotating_by_label.py b/annotating_by_label.py index 6f4be3b..372cab6 100644 --- a/annotating_by_label.py +++ b/annotating_by_label.py @@ -11,8 +11,8 @@ import annotating import annotating_with_checks # Roughly 10 A4 pages at 100 DPI -MAX_HEIGHT_PX = 11690 -# MAX_HEIGHT_PX = 18000 +# MAX_HEIGHT_PX = 11690 +MAX_HEIGHT_PX = 17000 # Soft limit: main() keeps a second batching at 1.1x this limit if it yields fewer batches. # MAX_HEIGHT_PX = 16000 def render_item(item): @@ -147,24 +147,42 @@ def main(): batches = [] current_batch = [] current_h = 0 - for r in rendered: sid = r[0] img_h = r[2].height - # Split if we exceed max height AND we are on a new student if current_batch and current_h + img_h > MAX_HEIGHT_PX and sid != last_sid: batches.append(current_batch) current_batch = [] current_h = 0 - current_batch.append(r) current_h += img_h last_sid = sid - if current_batch: batches.append(current_batch) + batches2 = [] + current_batch2 = [] + current_h2 = 0 + last_sid2 = None + for r in rendered: + sid = r[0] + img_h = r[2].height + # Split if we exceed 110% of max height AND we are on a new student + if current_batch2 and current_h2 + img_h > 1.1 *MAX_HEIGHT_PX \ + and sid != last_sid2: + batches2.append(current_batch2) + current_batch2 = [] + current_h2 = 0 + current_batch2.append(r) + current_h2 += img_h + last_sid2 = sid + if current_batch2: + batches2.append(current_batch2) + + if len(batches2) < len(batches): + batches = batches2 + for i, batch in enumerate(batches, 1): save_batch(batch, prefix, i, root_dir, args.overwrite)