"""Stack per-copy PDF files into labelled JPEG groups, one folder per identifier.

Input layout, below the root directory given on the command line::

    ROOT/Copies/.../CopieDD/IDENTIFIER.pdf

Output layout::

    ROOT/Par label/IDENTIFIER/Group_N.jpg
    ROOT/Par label/IDENTIFIER/Group_N.json
"""

import json
import os
import re
import shutil
import sys
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

from pdf2image import convert_from_path, pdfinfo_from_path
from PIL import Image, ImageDraw, ImageFont

from utils import natural_key

# Configuration
DPI = 200  # Good balance for readability and size
A4_HEIGHT_INCHES = 11.69
FULL_PAGE_PX = int(A4_HEIGHT_INCHES * DPI)
MAX_GROUP_HEIGHT = 1.5 * FULL_PAGE_PX
MAX_GROUP_COUNT = 8
SEPARATOR_HEIGHT = 20
LABEL_HEIGHT = 50
MAX_FILE_SIZE_BYTES = 2.5 * 1024 * 1024  # 2.5 MB


def get_pdf_height(path):
    """Return the total height of all pages of a PDF, in pixels at ``DPI``.

    The height of one page is read from the ``Page size`` entry of the PDF
    info and multiplied by the page count, so all pages are assumed to share
    that size.

    Args:
        path: Path of the PDF file to inspect.

    Returns:
        The estimated total height in pixels, or 0 if the file could not be
        inspected (the error is printed, not raised).
    """
    try:
        info = pdfinfo_from_path(path)

        # Get page count (default to 1)
        num_pages = int(info["Pages"]) if "Pages" in info else 1

        # The third whitespace-separated token of "Page size" is taken as the
        # height in points; 1 pt = 1/72 inch.
        pts_height = float(info["Page size"].split()[2]) if "Page size" in info else 0

        # Height of one page in pixels
        single_page_px = int((pts_height / 72.0) * DPI)

        return single_page_px * num_pages
    except Exception as e:
        # Best effort: an unreadable file is reported and treated as height 0.
        print(f"Error reading {path}: {e}")
        return 0


def collect_files(root_dir):
    """Scan ``root_dir`` for ``CopieDD/IDENTIFIER.pdf`` files.

    Every directory below ``root_dir`` whose name starts with ``Copie``
    followed by two digits is searched for ``.pdf`` files (case-insensitive
    extension).  The file name without extension is the identifier.

    Args:
        root_dir: Directory to walk recursively.

    Returns:
        A ``defaultdict`` mapping identifier to a list of
        ``(dd, path, height)`` triples, where ``dd`` is the two-digit folder
        number as a string and ``height`` comes from ``get_pdf_height``.
    """
    data = defaultdict(list)

    # Regex to match 'Copie' followed by 2 digits
    folder_pattern = re.compile(r'Copie(\d{2})')

    for root, _dirs, files in os.walk(root_dir):
        match = folder_pattern.match(os.path.basename(root))
        if not match:
            continue

        dd = match.group(1)
        for filename in files:
            if not filename.lower().endswith('.pdf'):
                continue
            identifier = os.path.splitext(filename)[0]
            full_path = os.path.join(root, filename)
            data[identifier].append((dd, full_path, get_pdf_height(full_path)))

    return data


def group_files(file_list):
    """Group files with a First Fit Decreasing heuristic.

    Items are sorted by height, tallest first, and each one is placed in the
    first existing group that still respects ``MAX_GROUP_COUNT`` and
    ``MAX_GROUP_HEIGHT``; otherwise it starts a new group.  The height charged
    per item mirrors the canvas built by ``create_jpg``: the item itself plus
    ``LABEL_HEIGHT``, plus ``SEPARATOR_HEIGHT`` for every item after the first
    of a group.

    Args:
        file_list: Iterable of ``(dd, path, height)`` triples.

    Returns:
        A list of groups, each a list of the input triples.
    """
    # Large items are the hardest to fit, so handle them first.
    # (Remove this sort if the input order must be strictly preserved.)
    sorted_files = sorted(file_list, key=lambda x: x[2], reverse=True)

    # Each group is a dict: {'items': [...], 'current_height': <pixels>}
    groups = []

    for item in sorted_files:
        # Every item is drawn below its own label.
        item_height = item[2] + LABEL_HEIGHT
        # Joining an existing group additionally costs one separator.
        needed = SEPARATOR_HEIGHT + item_height

        for group in groups:
            if len(group['items']) >= MAX_GROUP_COUNT:
                continue
            if group['current_height'] + needed <= MAX_GROUP_HEIGHT:
                group['items'].append(item)
                group['current_height'] += needed
                break
        else:
            # No existing group can take the item (or there is none yet).
            groups.append({'items': [item], 'current_height': item_height})

    # Return list of lists (strip the bookkeeping)
    return [g['items'] for g in groups]


def stitch_pdf_pages(images_list):
    """Vertically concatenate PIL images with no separator.

    Args:
        images_list: List of PIL images (the pages of one PDF).

    Returns:
        ``None`` for an empty list, the image itself for a single-element
        list, otherwise a new white RGB image as wide as the widest page with
        the pages pasted top to bottom at x = 0.
    """
    if not images_list:
        return None
    if len(images_list) == 1:
        return images_list[0]

    max_width = max(img.width for img in images_list)
    total_height = sum(img.height for img in images_list)

    combined = Image.new('RGB', (max_width, total_height), 'white')

    y_offset = 0
    for img in images_list:
        combined.paste(img, (0, y_offset))
        y_offset += img.height

    return combined


def _save_jpeg_under_limit(canvas, output_path):
    """Save ``canvas`` as JPEG, lowering quality until it fits the size limit.

    Qualities 90, 85, ... 15 are tried in turn; the first file that is at most
    ``MAX_FILE_SIZE_BYTES`` is kept.  If none fits, the last attempt stays on
    disk and a warning is printed.
    """
    quality = 90
    while quality > 10:
        canvas.save(output_path, "JPEG", quality=quality, optimize=True)
        if os.path.getsize(output_path) <= MAX_FILE_SIZE_BYTES:
            if quality < 90:
                print("quality : ", quality)
            return
        quality -= 5

    print(
        f"Warning: {output_path} is still larger than "
        f"{MAX_FILE_SIZE_BYTES / 1024 / 1024:.2f} MB at the lowest quality tried"
    )


def create_jpg(identifier, group_index, group, root_dir):
    """Render one group of PDFs into a labelled JPEG plus a JSON sidecar.

    Each PDF is rasterised at ``DPI``, its pages are stitched vertically, and
    the results are stacked on a white canvas.  Every entry gets an
    ``ID: dd`` text label above it and a black separator bar before all but
    the first.  PDFs that fail to convert are reported and skipped; if none
    converts, nothing is written.

    Files written to ``root_dir/identifier`` (created if missing):
        ``Group_<group_index + 1>.json``: list of
            ``[dd, h_min, h_max, width_ratio, identifier]`` entries, where
            ``h_min``/``h_max`` are the vertical pixel bounds of the entry on
            the canvas and ``width_ratio`` is its width over the canvas width.
        ``Group_<group_index + 1>.jpg``: the canvas.

    Args:
        identifier: Name of the output sub-folder, also stored in the JSON.
        group_index: Zero-based index of the group.
        group: List of ``(dd, path, height)`` triples.
        root_dir: Directory that holds the per-identifier output folders.
    """
    images = []

    # Render PDFs to images
    for dd, path, _ in group:
        try:
            pages = convert_from_path(path, dpi=DPI)
            # Concatenate multi-page PDFs into one single image object
            combined_img = stitch_pdf_pages(pages)
            if combined_img is not None:
                images.append((dd, combined_img))
        except Exception as e:
            # Best effort: a broken PDF must not prevent the rest of the group.
            print(f"Failed to convert {path}: {e}")

    if not images:
        return

    # Canvas size: images + one separator between neighbours + one label each
    total_width = max(img.width for _, img in images)
    total_height = (
        sum(img.height for _, img in images)
        + (len(images) - 1) * SEPARATOR_HEIGHT
        + len(images) * LABEL_HEIGHT
    )

    canvas = Image.new('RGB', (total_width, total_height), 'white')
    draw = ImageDraw.Draw(canvas)

    # Try loading a font, fallback to default
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 40)
    except IOError:
        print("font not found")
        font = ImageFont.load_default()

    metadata = []  # (dd, h_min, h_max, width_ratio, identifier) per entry
    y_offset = 0
    for i, (dd, img) in enumerate(images):
        # Draw separator if not first image
        if i > 0:
            draw.rectangle(
                [0, y_offset, total_width, y_offset + SEPARATOR_HEIGHT],
                fill='black',
            )
            y_offset += SEPARATOR_HEIGHT

        # Draw the label
        draw.text((10, y_offset + 5), f"ID: {dd}", fill='black', font=font)
        y_offset += LABEL_HEIGHT

        # Record image coordinates (identifier should be a label)
        metadata.append(
            (dd, y_offset, y_offset + img.height, img.width / total_width, identifier)
        )

        # Draw the image, left-aligned
        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)

    # Save JSON metadata
    json_path = os.path.join(target_folder, f"Group_{group_index+1}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f)

    # Save with size constraints
    output_path = os.path.join(target_folder, f"Group_{group_index+1}.jpg")
    _save_jpeg_under_limit(canvas, output_path)

    print(f"Saved {output_path} with {len(group)} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")


def process_identifier(identifier, files_info, output_dir):
    """Rebuild every group image for one identifier.

    The folder ``output_dir/identifier`` is deleted if it exists and then
    recreated, so earlier output never survives.

    Args:
        identifier: Identifier whose files are processed.
        files_info: List of ``(dd, path, height)`` triples.
        output_dir: Directory that holds the per-identifier output folders.
    """
    # Clear output directory if it exists
    target_folder = os.path.join(output_dir, identifier)
    if os.path.exists(target_folder):
        shutil.rmtree(target_folder)
    os.makedirs(target_folder, exist_ok=True)

    file_groups = group_files(files_info)

    for idx, group in enumerate(file_groups):
        create_jpg(identifier, idx, group, output_dir)


def main():
    """Command-line entry point: ``python app.py ROOT_DIR``.

    Reads PDFs from ``ROOT_DIR/Copies`` and writes the grouped images to
    ``ROOT_DIR/Par label``, handling identifiers in natural order on a pool
    of 8 threads.  A failure while processing one identifier is reported and
    does not stop the others.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py ROOT_DIR")
        sys.exit(1)

    root_dir = Path(sys.argv[1])
    copies_dir = root_dir / "Copies"
    par_label_dir = root_dir / "Par label"

    print("Scanning files...")
    data = collect_files(copies_dir)

    print(f"Found {len(data)} identifiers. Processing...")

    # Sort identifiers naturally
    sorted_identifiers = sorted(data.keys(), key=natural_key)

    # Process using 8 threads
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {
            executor.submit(
                process_identifier, identifier, data[identifier], par_label_dir
            ): identifier
            for identifier in sorted_identifiers
        }
        # Reading each result surfaces exceptions that would otherwise be
        # silently dropped together with the future.
        for future, identifier in futures.items():
            try:
                future.result()
            except Exception as e:
                print(f"Failed to process {identifier}: {e}")

    print("Done.")


if __name__ == "__main__":
    main()