"""Render grouped PDF copies as labelled JPEG sheets.

Scans ``<root_dir>/CopieNN/<identifier>.pdf``, groups the PDFs that share an
identifier, and writes each group to ``<root_dir>/<identifier>/Group_<k>.jpg``
together with a ``Group_<k>.json`` file listing the vertical pixel range
occupied by each copy inside the JPEG.
"""
import json
import os
import re
import shutil
import sys
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

from PIL import Image, ImageDraw, ImageFont
from pdf2image import convert_from_path, pdfinfo_from_path

# Configuration
DPI = 200  # Good balance for readability and size
A4_HEIGHT_INCHES = 11.69
FULL_PAGE_PX = int(A4_HEIGHT_INCHES * DPI)
MAX_GROUP_HEIGHT = 1.5 * FULL_PAGE_PX
MAX_GROUP_COUNT = 8
SEPARATOR_HEIGHT = 20
LABEL_HEIGHT = 50
MAX_FILE_SIZE_BYTES = 2.5 * 1024 * 1024  # 2.5 MiB


def get_pdf_height(path):
    """Return the estimated total height of all pages, in pixels at ``DPI``.

    The estimate multiplies the height taken from the "Page size" entry of
    the PDF info by the page count, i.e. it treats every page as having that
    same height.

    Returns 0 (after printing the error) when the info cannot be read or
    parsed, so a single bad file does not abort the scan.
    """
    try:
        info = pdfinfo_from_path(path)
        # Default to a single page when the count is missing.
        num_pages = int(info.get("Pages", 1))
        # Assumes "Page size" has the form "<width> x <height> pts ...";
        # the third space-separated token is then the height in points.
        if "Page size" in info:
            pts_height = float(info["Page size"].split(" ")[2])
        else:
            pts_height = 0
        # 1 pt = 1/72 inch
        single_page_px = int((pts_height / 72.0) * DPI)
        return single_page_px * num_pages
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return 0


def collect_files(root_dir):
    """Scan ``root_dir`` for ``Copiedd/identifier.pdf`` files.

    A directory is considered when its name starts with ``Copie`` followed by
    two digits ``dd``; every ``.pdf`` file directly inside it is recorded.

    Returns a dict ``{identifier: [(dd, path, height), ...]}`` where
    ``identifier`` is the file name without extension and ``height`` is the
    value returned by ``get_pdf_height``.
    """
    data = defaultdict(list)
    # 'Copie' followed by 2 digits, anchored at the start of the folder name.
    folder_pattern = re.compile(r'Copie(\d{2})')

    for root, dirs, files in os.walk(root_dir):
        folder_name = os.path.basename(root)
        match = folder_pattern.match(folder_name)
        if not match:
            continue
        dd = match.group(1)
        for file in files:
            if not file.lower().endswith('.pdf'):
                continue
            identifier = os.path.splitext(file)[0]
            full_path = os.path.join(root, file)
            height = get_pdf_height(full_path)
            data[identifier].append((dd, full_path, height))
    return data


def group_files(file_list):
    """Pack ``(dd, path, height)`` triples into groups (First Fit Decreasing).

    Items are sorted by height, tallest first, and each one is placed into
    the first existing group that still satisfies both limits:

    * at most ``MAX_GROUP_COUNT`` items per group;
    * estimated canvas height at most ``MAX_GROUP_HEIGHT``.

    The height estimate uses the same layout as ``create_jpg``: every item
    is preceded by a label band of ``LABEL_HEIGHT`` pixels, and every item
    after the first is additionally preceded by a separator of
    ``SEPARATOR_HEIGHT`` pixels.  An item that fits nowhere (including one
    taller than the limit on its own) starts a new group.

    Returns a list of groups, each a list of the original triples.
    """
    # Large items are the hardest to fit, so handle them first.
    sorted_files = sorted(file_list, key=lambda x: x[2], reverse=True)

    # Each group is a dict: {'items': [...], 'current_height': <pixels>}
    groups = []
    for item in sorted_files:
        height = item[2]
        # Groups are never empty, so a newcomer always pays for a separator
        # as well as its own label.
        added_height = height + SEPARATOR_HEIGHT + LABEL_HEIGHT

        for group in groups:
            if len(group['items']) >= MAX_GROUP_COUNT:
                continue
            if group['current_height'] + added_height <= MAX_GROUP_HEIGHT:
                group['items'].append(item)
                group['current_height'] += added_height
                break
        else:
            # No existing group could take the item.
            groups.append({
                'items': [item],
                'current_height': height + LABEL_HEIGHT,
            })

    # Strip the bookkeeping and return plain lists.
    return [g['items'] for g in groups]


def stitch_pdf_pages(images_list):
    """Vertically concatenate PIL images with no separator.

    Returns ``None`` for an empty list, the image itself for a single-element
    list, and otherwise a new white RGB image as wide as the widest input
    with the inputs pasted top to bottom, left-aligned.
    """
    if not images_list:
        return None
    if len(images_list) == 1:
        return images_list[0]

    max_width = max(img.width for img in images_list)
    total_height = sum(img.height for img in images_list)
    combined = Image.new('RGB', (max_width, total_height), 'white')

    y_offset = 0
    for img in images_list:
        combined.paste(img, (0, y_offset))
        y_offset += img.height
    return combined


def _save_jpeg_under_limit(canvas, output_path):
    """Save ``canvas`` as JPEG, lowering quality until the size limit is met.

    Quality starts at 90 and drops in steps of 5 down to 15.  If even the
    last attempt exceeds ``MAX_FILE_SIZE_BYTES`` the file from that attempt
    is kept and a warning is printed.
    """
    quality = 90
    while quality > 10:
        canvas.save(output_path, "JPEG", quality=quality, optimize=True)
        if os.path.getsize(output_path) <= MAX_FILE_SIZE_BYTES:
            if quality < 90:
                print("quality : ", quality)
            break
        quality -= 5
    else:
        # Loop ran out of quality steps without ever meeting the limit.
        print(
            f"Warning: {output_path} still exceeds "
            f"{MAX_FILE_SIZE_BYTES / 1024 / 1024:.2f} MB at the lowest quality tried"
        )


def create_jpg(identifier, group_index, group, root_dir):
    """Render one group of PDFs into a labelled JPEG plus a JSON sidecar.

    Each PDF in ``group`` (a list of ``(dd, path, height)`` triples) is
    rasterised at ``DPI``, its pages stitched vertically, and the results are
    stacked on a white canvas.  Every entry is preceded by an ``ID: <dd>``
    label, and entries after the first by a black separator bar.

    Writes ``Group_<group_index+1>.jpg`` and ``Group_<group_index+1>.json``
    into ``<root_dir>/<identifier>/``.  The JSON holds a list of
    ``[dd, h_min, h_max]`` entries giving the vertical pixel range of each
    rendered PDF on the canvas.  PDFs that fail to convert are reported and
    skipped; nothing is written if none converts.
    """
    images = []
    for dd, path, _ in group:
        try:
            pages = convert_from_path(path, dpi=DPI)
            if pages:
                # Concatenate multi-page PDFs into one single image object.
                combined_img = stitch_pdf_pages(pages)
                if combined_img:
                    images.append((dd, combined_img))
        except Exception as e:
            print(f"Failed to convert {path}: {e}")

    if not images:
        return

    # Canvas size: images + separators between them + one label band each.
    total_width = max(img.width for _, img in images)
    total_height = (
        sum(img.height for _, img in images)
        + (len(images) - 1) * SEPARATOR_HEIGHT
        + len(images) * LABEL_HEIGHT
    )

    canvas = Image.new('RGB', (total_width, total_height), 'white')
    draw = ImageDraw.Draw(canvas)

    # Try loading a scalable font, fall back to Pillow's built-in default.
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 40)
    except OSError:
        print("font not found")
        font = ImageFont.load_default()

    metadata = []  # (dd, h_min, h_max) per rendered PDF
    y_offset = 0
    for i, (dd, img) in enumerate(images):
        # Separator bar before every image except the first.
        if i > 0:
            draw.rectangle(
                [0, y_offset, total_width, y_offset + SEPARATOR_HEIGHT],
                fill='black',
            )
            y_offset += SEPARATOR_HEIGHT

        # Label band.
        draw.text((10, y_offset + 5), f"ID: {dd}", fill='black', font=font)
        y_offset += LABEL_HEIGHT

        # Record where the image lands, then paste it.
        metadata.append((dd, y_offset, y_offset + img.height))
        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)

    # Save JSON metadata.
    json_path = os.path.join(target_folder, f"Group_{group_index+1}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f)

    # Save the JPEG with size constraints.
    output_path = os.path.join(target_folder, f"Group_{group_index+1}.jpg")
    _save_jpeg_under_limit(canvas, output_path)

    print(f"Saved {output_path} with {len(group)} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")


def natural_key(text):
    """Return a sort key that orders embedded numbers numerically.

    ``text`` is split into alternating non-digit and digit runs; digit runs
    become ints and the rest is lower-cased, so ``"a2"`` sorts before
    ``"a10"``.
    """
    chunks = re.split(r'(\d+)', str(text))
    # With one capturing group, re.split alternates text, digits, text, ...
    # Deciding by position (not str.isdigit) keeps every key element at a
    # given index of the same type and avoids int() on characters such as
    # superscript digits that isdigit() accepts but \d does not split on.
    return [int(chunk) if i % 2 else chunk.lower() for i, chunk in enumerate(chunks)]


def process_identifier(identifier, files_info, root_dir):
    """Regenerate all output groups for one identifier.

    Removes ``<root_dir>/<identifier>`` if it exists, recreates it, groups
    ``files_info`` (a list of ``(dd, path, height)``) with ``group_files``
    and renders each group with ``create_jpg``.
    """
    # Clear output directory if it exists.
    target_folder = os.path.join(root_dir, identifier)
    if os.path.exists(target_folder):
        shutil.rmtree(target_folder)
    os.makedirs(target_folder, exist_ok=True)

    file_groups = group_files(files_info)
    for idx, group in enumerate(file_groups):
        create_jpg(identifier, idx, group, root_dir)


def main():
    """Command-line entry point: ``python app.py <root_dir>``."""
    if len(sys.argv) < 2:
        print("Usage: python app.py <root_dir>")
        sys.exit(1)

    root_dir = sys.argv[1]

    print("Scanning files...")
    data = collect_files(root_dir)

    print(f"Found {len(data)} identifiers. Processing...")

    # Sort identifiers naturally.
    sorted_identifiers = sorted(data.keys(), key=natural_key)

    # Process using 4 threads.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            (identifier,
             executor.submit(process_identifier, identifier, data[identifier], root_dir))
            for identifier in sorted_identifiers
        ]
        # Retrieve every result so an exception raised inside a worker is
        # reported instead of being silently dropped with the future.
        for identifier, future in futures:
            try:
                future.result()
            except Exception as e:
                print(f"Failed to process {identifier}: {e}")

    print("Done.")


if __name__ == "__main__":
    main()