Faster grouping, in order, with threads

master
Sébastien Miquel 2026-02-25 21:07:28 +01:00
parent b13ed34acf
commit f3dc5c452a
1 changed files with 25 additions and 46 deletions

View File

@ -4,6 +4,7 @@ import re
import sys import sys
import shutil import shutil
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from PIL import Image, ImageDraw, ImageFont from PIL import Image, ImageDraw, ImageFont
from pdf2image import convert_from_path, pdfinfo_from_path from pdf2image import convert_from_path, pdfinfo_from_path
@ -105,40 +106,6 @@ def group_files(file_list):
# Return list of lists (strip the metadata) # Return list of lists (strip the metadata)
return [g['items'] for g in groups] return [g['items'] for g in groups]
# def group_files(file_list):
# """Groups files based on constraints."""
# sorted_files = sorted(file_list, key=lambda x: x[0])
# groups = []
# current_group = []
# current_height = 0
# for item in sorted_files:
# dd, path, height = item
# # Calculate added height (image + separator + approx text space)
# # We add separator height only if it's not the first image
# added_overhead = SEPARATOR_HEIGHT + 30 if current_group else 0
# # Check conditions
# if (len(current_group) >= MAX_GROUP_COUNT or
# (current_height + height + added_overhead) > MAX_GROUP_HEIGHT):
# # Push current group and start new
# if current_group:
# groups.append(current_group)
# current_group = []
# current_height = 0
# added_overhead = 0 # Reset for first file of new group
# current_group.append(item)
# current_height += height + added_overhead
# if current_group:
# groups.append(current_group)
# return groups
def stitch_pdf_pages(images_list): def stitch_pdf_pages(images_list):
"""Vertically concatenates a list of PIL images with no separator.""" """Vertically concatenates a list of PIL images with no separator."""
if not images_list: if not images_list:
@ -242,19 +209,11 @@ def create_jpg(identifier, group_index, group, root_dir):
print(f"Saved {output_path} with {len(group)} ({os.path.getsize(output_path)/1024/1024:.2f} MB)") print(f"Saved {output_path} with {len(group)} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")
def natural_key(text):
    """Return a sort key that orders embedded numbers by value.

    The string form of *text* is split into alternating non-digit and
    digit runs; digit runs become ints and the rest is lower-cased, so
    that e.g. "a2" sorts before "a10" and case is ignored.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        A list alternating ``str`` and ``int`` items, always starting
        with a (possibly empty) ``str``.
    """
    parts = re.split(r'(\d+)', str(text))
    # With a capturing split, odd positions are exactly the \d+ matches.
    # Classifying by position instead of str.isdigit() avoids calling
    # int() on characters such as "²" that isdigit() accepts but that
    # \d does not match and int() cannot parse.
    return [
        int(part) if index % 2 else part.lower()
        for index, part in enumerate(parts)
    ]
print("Usage: python app.py <Path_to_Dir>")
sys.exit(1)
root_dir = sys.argv[1]
print("Scanning files...") def process_identifier(identifier, files_info, root_dir):
data = collect_files(root_dir)
print(f"Found {len(data)} identifiers. Processing...")
for identifier, files_info in data.items():
# Clear output directory if it exists # Clear output directory if it exists
target_folder = os.path.join(root_dir, identifier) target_folder = os.path.join(root_dir, identifier)
if os.path.exists(target_folder): if os.path.exists(target_folder):
@ -267,6 +226,26 @@ def main():
for idx, group in enumerate(file_groups): for idx, group in enumerate(file_groups):
create_jpg(identifier, idx, group, root_dir) create_jpg(identifier, idx, group, root_dir)
def main():
    """Process every identifier found under the directory given on the CLI.

    Reads the root directory from ``sys.argv[1]``; when it is missing, a
    usage message is printed and the process exits with status 1. The
    files are gathered with ``collect_files`` and each identifier is
    handed to ``process_identifier`` on a pool of 4 worker threads.
    Identifiers are submitted in natural sort order; the order in which
    they finish is decided by the pool.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <Path_to_Dir>")
        sys.exit(1)

    root_dir = sys.argv[1]

    print("Scanning files...")
    data = collect_files(root_dir)

    print(f"Found {len(data)} identifiers. Processing...")

    # Sort identifiers naturally
    sorted_identifiers = sorted(data, key=natural_key)

    # Process using 4 threads
    with ThreadPoolExecutor(max_workers=4) as executor:
        submitted = [
            (
                identifier,
                executor.submit(
                    process_identifier, identifier, data[identifier], root_dir
                ),
            )
            for identifier in sorted_identifiers
        ]

        # A Future stores any exception raised by its task; unless
        # result() is called the failure would pass unnoticed. Report
        # each failure and keep going so one bad identifier does not
        # hide the outcome of the others.
        for identifier, future in submitted:
            try:
                future.result()
            except Exception as exc:
                print(
                    f"Failed to process {identifier}: {exc!r}",
                    file=sys.stderr,
                )

    print("Done.")
if __name__ == "__main__": if __name__ == "__main__":