Faster grouping, in order, with threads
parent
b13ed34acf
commit
f3dc5c452a
69
grouping.py
69
grouping.py
|
|
@ -4,6 +4,7 @@ import re
|
|||
import sys
|
||||
import shutil
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
from pdf2image import convert_from_path, pdfinfo_from_path
|
||||
|
||||
|
|
@ -105,40 +106,6 @@ def group_files(file_list):
|
|||
# Return list of lists (strip the metadata)
|
||||
return [g['items'] for g in groups]
|
||||
|
||||
# def group_files(file_list):
|
||||
# """Groups files based on constraints."""
|
||||
# sorted_files = sorted(file_list, key=lambda x: x[0])
|
||||
|
||||
# groups = []
|
||||
# current_group = []
|
||||
# current_height = 0
|
||||
|
||||
# for item in sorted_files:
|
||||
# dd, path, height = item
|
||||
|
||||
# # Calculate added height (image + separator + approx text space)
|
||||
# # We add separator height only if it's not the first image
|
||||
# added_overhead = SEPARATOR_HEIGHT + 30 if current_group else 0
|
||||
|
||||
# # Check conditions
|
||||
# if (len(current_group) >= MAX_GROUP_COUNT or
|
||||
# (current_height + height + added_overhead) > MAX_GROUP_HEIGHT):
|
||||
|
||||
# # Push current group and start new
|
||||
# if current_group:
|
||||
# groups.append(current_group)
|
||||
# current_group = []
|
||||
# current_height = 0
|
||||
# added_overhead = 0 # Reset for first file of new group
|
||||
|
||||
# current_group.append(item)
|
||||
# current_height += height + added_overhead
|
||||
|
||||
# if current_group:
|
||||
# groups.append(current_group)
|
||||
|
||||
# return groups
|
||||
|
||||
def stitch_pdf_pages(images_list):
|
||||
"""Vertically concatenates a list of PIL images with no separator."""
|
||||
if not images_list:
|
||||
|
|
@ -242,6 +209,23 @@ def create_jpg(identifier, group_index, group, root_dir):
|
|||
|
||||
print(f"Saved {output_path} with {len(group)} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")
|
||||
|
||||
def natural_key(text):
    """Return a sort key that orders embedded numbers by numeric value.

    ``text`` is converted with ``str()`` and split into alternating
    non-digit and digit runs. Digit runs are turned into ``int`` so that
    "2" sorts before "10"; every other run is lower-cased so the ordering
    is case-insensitive.

    Returns:
        A list of ``str`` and ``int`` parts suitable as a ``key=`` value
        for ``sorted``.
    """
    # isdecimal() is used instead of isdigit() on purpose: isdigit() is also
    # true for characters such as superscript "²", which the r'\d+' split
    # never isolates as a digit run and which int() rejects with ValueError.
    return [
        int(part) if part.isdecimal() else part.lower()
        for part in re.split(r'(\d+)', str(text))
    ]
|
||||
|
||||
|
||||
def process_identifier(identifier, files_info, root_dir):
    """Recreate the output folder for one identifier and render its groups.

    Args:
        identifier: Name of the sub-folder under ``root_dir`` to rebuild.
        files_info: List of ``(dd, path, height)`` tuples, passed unchanged
            to ``group_files``.
        root_dir: Directory that contains the per-identifier folders.
    """
    out_dir = os.path.join(root_dir, identifier)

    # Start from an empty folder: drop whatever a previous run left behind.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    # One create_jpg call per group, numbered in the order group_files
    # returns them.
    for position, batch in enumerate(group_files(files_info)):
        create_jpg(identifier, position, batch, root_dir)
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python app.py <Path_to_Dir>")
|
||||
|
|
@ -254,18 +238,13 @@ def main():
|
|||
|
||||
print(f"Found {len(data)} identifiers. Processing...")
|
||||
|
||||
for identifier, files_info in data.items():
|
||||
# Clear output directory if it exists
|
||||
target_folder = os.path.join(root_dir, identifier)
|
||||
if os.path.exists(target_folder):
|
||||
shutil.rmtree(target_folder)
|
||||
os.makedirs(target_folder, exist_ok=True)
|
||||
# Sort identifiers naturally
|
||||
sorted_identifiers = sorted(data.keys(), key=natural_key)
|
||||
|
||||
# files_info is list of (dd, path, height)
|
||||
file_groups = group_files(files_info)
|
||||
|
||||
for idx, group in enumerate(file_groups):
|
||||
create_jpg(identifier, idx, group, root_dir)
|
||||
# Process using 4 threads
|
||||
with ThreadPoolExecutor(max_workers=4) as executor:
|
||||
for identifier in sorted_identifiers:
|
||||
executor.submit(process_identifier, identifier, data[identifier], root_dir)
|
||||
|
||||
print("Done.")
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue