Faster grouping, in order, with threads

2026-02-25 21:07:28 +01:00 · 2026-02-25 21:07:28 +01:00 · f3dc5c452a
parent b13ed34acf
commit f3dc5c452a
1 changed files with 25 additions and 46 deletions
--- a/grouping.py
+++ b/grouping.py
@ -4,6 +4,7 @@ import re
 import sys
 import shutil
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from PIL import Image, ImageDraw, ImageFont
 from pdf2image import convert_from_path, pdfinfo_from_path
@ -105,40 +106,6 @@ def group_files(file_list):
    # Return list of lists (strip the metadata)
    return [g['items'] for g in groups]
 # def group_files(file_list):
 #     """Groups files based on constraints."""
 #     sorted_files = sorted(file_list, key=lambda x: x[0])
 #     groups = []
 #     current_group = []
 #     current_height = 0
 #     for item in sorted_files:
 #         dd, path, height = item
 #         # Calculate added height (image + separator + approx text space)
 #         # We add separator height only if it's not the first image
 #         added_overhead = SEPARATOR_HEIGHT + 30 if current_group else 0
 #         # Check conditions
 #         if (len(current_group) >= MAX_GROUP_COUNT or
 #             (current_height + height + added_overhead) > MAX_GROUP_HEIGHT):
 #             # Push current group and start new
 #             if current_group:
 #                 groups.append(current_group)
 #             current_group = []
 #             current_height = 0
 #             added_overhead = 0 # Reset for first file of new group
 #         current_group.append(item)
 #         current_height += height + added_overhead
 #     if current_group:
 #         groups.append(current_group)
 #     return groups
 def stitch_pdf_pages(images_list):
    """Vertically concatenates a list of PIL images with no separator."""
    if not images_list:
@ -242,19 +209,11 @@ def create_jpg(identifier, group_index, group, root_dir):
    print(f"Saved {output_path} with {len(group)} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")
-def main():
+def natural_key(text):
-    if len(sys.argv) < 2:
+    return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(text))]
        print("Usage: python app.py <Path_to_Dir>")
        sys.exit(1)
    root_dir = sys.argv[1]
-    print("Scanning files...")
+def process_identifier(identifier, files_info, root_dir):
    data = collect_files(root_dir)
    print(f"Found {len(data)} identifiers. Processing...")
    for identifier, files_info in data.items():
    # Clear output directory if it exists
    target_folder = os.path.join(root_dir, identifier)
    if os.path.exists(target_folder):
@ -267,6 +226,26 @@ def main():
    for idx, group in enumerate(file_groups):
        create_jpg(identifier, idx, group, root_dir)
 def main():
    """Command-line entry point: process every identifier found under a directory.

    Reads the root directory from ``sys.argv[1]``, gathers the per-identifier
    file information with ``collect_files``, and hands each identifier to
    ``process_identifier`` on a pool of 4 worker threads.

    Identifiers are *submitted* in natural sort order (``natural_key``), but
    the workers run concurrently, so completion order is not guaranteed.

    Exits with status 1 when no directory argument is given, or when
    processing of at least one identifier raised an exception.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <Path_to_Dir>")
        sys.exit(1)
    root_dir = sys.argv[1]
    print("Scanning files...")
    data = collect_files(root_dir)
    print(f"Found {len(data)} identifiers. Processing...")
    # Sort identifiers naturally
    sorted_identifiers = sorted(data, key=natural_key)
    failed = []
    # Process using 4 threads
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Keep every Future: a dropped Future swallows whatever exception the
        # worker raised, which would make failures invisible.
        futures = [
            (identifier,
             executor.submit(process_identifier, identifier, data[identifier], root_dir))
            for identifier in sorted_identifiers
        ]
        for identifier, future in futures:
            try:
                # result() re-raises any exception from the worker thread.
                future.result()
            except Exception as exc:  # top-level boundary: report, keep going
                failed.append(identifier)
                print(f"Failed to process {identifier}: {exc!r}", file=sys.stderr)
    if failed:
        print(f"Finished with {len(failed)} failed identifier(s).", file=sys.stderr)
        sys.exit(1)
    print("Done.")
 if __name__ == "__main__":