import os
import json
import re
import sys
import shutil
from collections import defaultdict

from PIL import Image, ImageDraw, ImageFont
from pdf2image import convert_from_path, pdfinfo_from_path

# Configuration
DPI = 200  # Good balance for readability and size
A4_HEIGHT_INCHES = 11.69
FULL_PAGE_PX = int(A4_HEIGHT_INCHES * DPI)
MAX_GROUP_HEIGHT = 2.5 * FULL_PAGE_PX  # Soft limit on the height of one output image
MAX_GROUP_COUNT = 15  # Maximum number of PDFs stacked into one output image
SEPARATOR_HEIGHT = 20  # Black bar drawn between two consecutive PDFs
LABEL_HEIGHT = 50  # Strip reserved above every PDF for its "ID: dd" label
MAX_FILE_SIZE_BYTES = 2.5 * 1024 * 1024  # 2.5 MB

# JPEG quality search used when an output file is too large
_JPEG_START_QUALITY = 90
_JPEG_MIN_QUALITY = 10  # Exclusive lower bound: the last quality tried is 15
_JPEG_QUALITY_STEP = 5


def get_pdf_height(path):
    """Return the estimated total height of all pages in pixels at ``DPI``.

    The height is computed from the PDF metadata only (no rendering): the
    reported page height is multiplied by the page count, so every page is
    assumed to have the size reported under "Page size".

    Args:
        path: Path of the PDF file.

    Returns:
        The total height in pixels, or 0 when the metadata cannot be read
        or contains no page size.
    """
    try:
        info = pdfinfo_from_path(path)
        # Page count defaults to 1 when the field is missing
        num_pages = int(info.get("Pages", 1))
        page_size = info.get("Page size")
        if not page_size:
            return 0
        # The third whitespace-separated token is taken as the height in
        # points (the value is expected to look like "W x H pts ...").
        pts_height = float(page_size.split()[2])
        # 1 pt = 1/72 inch
        single_page_px = int((pts_height / 72.0) * DPI)
        return single_page_px * num_pages
    except Exception as e:
        # Best effort: an unreadable PDF counts as height 0 instead of
        # aborting the whole scan.
        print(f"Error reading {path}: {e}")
        return 0


def collect_files(root_dir):
    """Scan ``root_dir`` for PDFs stored as ``Copiedd/identifier.pdf``.

    Every directory below ``root_dir`` (at any depth) whose name starts with
    ``Copie`` followed by two digits is inspected; the two digits become the
    ``dd`` tag of the PDFs it contains.

    Args:
        root_dir: Directory to walk.

    Returns:
        A dict mapping each identifier (PDF file name without extension) to
        a list of ``(dd, path, height)`` tuples, where ``height`` comes from
        :func:`get_pdf_height`.
    """
    data = defaultdict(list)
    # 'Copie' followed by 2 digits, anchored at the start of the folder name
    folder_pattern = re.compile(r'Copie(\d{2})')

    for root, _dirs, files in os.walk(root_dir):
        match = folder_pattern.match(os.path.basename(root))
        if not match:
            continue
        dd = match.group(1)
        for file in files:
            if not file.lower().endswith('.pdf'):
                continue
            identifier = os.path.splitext(file)[0]
            full_path = os.path.join(root, file)
            height = get_pdf_height(full_path)
            data[identifier].append((dd, full_path, height))
    return data


def group_files(file_list):
    """Split the files of one identifier into groups that fit one image.

    Files are ordered by their ``dd`` tag and packed greedily. A new group
    is started when the current one already holds ``MAX_GROUP_COUNT`` files
    or when adding the next file would exceed ``MAX_GROUP_HEIGHT``. A file
    that is too tall on its own still gets a group of its own.

    Args:
        file_list: List of ``(dd, path, height)`` tuples.

    Returns:
        A list of groups, each a non-empty list of the input tuples.
    """
    ordered = sorted(file_list, key=lambda entry: entry[0])
    groups = []
    current_group = []
    current_height = 0

    for item in ordered:
        height = item[2]
        # Mirror the layout drawn by create_jpg: every entry has a label
        # strip, and every entry but the first is preceded by a separator.
        item_height = height + LABEL_HEIGHT
        needed = item_height + (SEPARATOR_HEIGHT if current_group else 0)

        group_is_full = len(current_group) >= MAX_GROUP_COUNT
        too_tall = current_height + needed > MAX_GROUP_HEIGHT
        if current_group and (group_is_full or too_tall):
            groups.append(current_group)
            current_group = []
            current_height = 0
            needed = item_height  # First entry of a group has no separator

        current_group.append(item)
        current_height += needed

    if current_group:
        groups.append(current_group)
    return groups


def stitch_pdf_pages(images_list):
    """Vertically concatenate PIL images with no separator.

    Args:
        images_list: List of PIL images (the rendered pages of one PDF).

    Returns:
        ``None`` for an empty list, the image itself for a single-element
        list, otherwise a new white-background RGB image as wide as the
        widest page with all pages pasted top to bottom, left-aligned.
    """
    if not images_list:
        return None
    if len(images_list) == 1:
        return images_list[0]

    max_width = max(img.width for img in images_list)
    total_height = sum(img.height for img in images_list)
    combined = Image.new('RGB', (max_width, total_height), 'white')

    y_offset = 0
    for img in images_list:
        combined.paste(img, (0, y_offset))
        y_offset += img.height
    return combined


def _load_label_font():
    """Return the label font, falling back to PIL's default font."""
    try:
        return ImageFont.truetype("DejaVuSans.ttf", 40)
    except IOError:
        print("font not found")
        return ImageFont.load_default()


def _save_jpeg_within_limit(canvas, output_path):
    """Save ``canvas`` as JPEG, lowering quality until the size limit is met.

    Returns the size in bytes of the file left on disk.
    """
    quality = _JPEG_START_QUALITY
    size = 0
    while quality > _JPEG_MIN_QUALITY:
        canvas.save(output_path, "JPEG", quality=quality, optimize=True)
        size = os.path.getsize(output_path)
        if size <= MAX_FILE_SIZE_BYTES:
            if quality < _JPEG_START_QUALITY:
                print("quality : ", quality)
            return size
        quality -= _JPEG_QUALITY_STEP

    # Every quality was tried; the lowest-quality file stays on disk.
    print(f"Warning: {output_path} still exceeds "
          f"{MAX_FILE_SIZE_BYTES / 1024 / 1024:.2f} MB at the lowest quality")
    return size


def create_jpg(identifier, group_index, group, root_dir):
    """Render one group of PDFs into a single labelled JPEG plus metadata.

    Each PDF is rendered at ``DPI`` (multi-page PDFs are stitched into one
    image), then stacked top to bottom. Every entry is preceded by a label
    strip showing ``ID: dd``; consecutive entries are separated by a black
    bar. Two files are written to ``root_dir/identifier``:

    * ``Group_<n>.json``: a list of ``[dd, h_min, h_max]`` giving the
      vertical pixel range of each PDF inside the JPEG;
    * ``Group_<n>.jpg``: the image, re-encoded at decreasing quality until
      it fits ``MAX_FILE_SIZE_BYTES`` (or the lowest quality is reached).

    ``<n>`` is ``group_index + 1``. PDFs that fail to render are reported
    and skipped; nothing is written when no PDF of the group renders.

    Args:
        identifier: Name of the output sub-folder.
        group_index: Zero-based index of the group.
        group: List of ``(dd, path, height)`` tuples.
        root_dir: Directory under which the output folder is created.
    """
    images = []
    for dd, path, _ in group:
        try:
            pages = convert_from_path(path, dpi=DPI)
            if pages:
                # Multi-page PDFs become one tall image
                combined_img = stitch_pdf_pages(pages)
                if combined_img:
                    images.append((dd, combined_img))
        except Exception as e:
            # Best effort: one broken PDF must not drop the whole group
            print(f"Failed to convert {path}: {e}")

    if not images:
        return

    # Canvas size: images + separators between them + one label per image
    total_width = max(img.width for _, img in images)
    total_height = (
        sum(img.height for _, img in images)
        + (len(images) - 1) * SEPARATOR_HEIGHT
        + len(images) * LABEL_HEIGHT
    )

    canvas = Image.new('RGB', (total_width, total_height), 'white')
    draw = ImageDraw.Draw(canvas)
    font = _load_label_font()

    metadata = []  # (dd, h_min, h_max) for every pasted image
    y_offset = 0
    for i, (dd, img) in enumerate(images):
        if i > 0:
            # rectangle() includes both end points, hence the "- 1": the
            # bar is exactly SEPARATOR_HEIGHT rows and stays out of the
            # label strip below it.
            draw.rectangle(
                [0, y_offset,
                 total_width - 1, y_offset + SEPARATOR_HEIGHT - 1],
                fill='black',
            )
            y_offset += SEPARATOR_HEIGHT

        draw.text((10, y_offset + 5), f"ID: {dd}", fill='black', font=font)
        y_offset += LABEL_HEIGHT

        metadata.append((dd, y_offset, y_offset + img.height))
        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)

    json_path = os.path.join(target_folder, f"Group_{group_index+1}.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f)

    output_path = os.path.join(target_folder, f"Group_{group_index+1}.jpg")
    size = _save_jpeg_within_limit(canvas, output_path)
    print(f"Saved {output_path} ({size/1024/1024:.2f} MB)")


def _is_inside(path, folder):
    """Return True when ``path`` is ``folder`` itself or lies beneath it."""
    path = os.path.abspath(path)
    folder = os.path.abspath(folder)
    try:
        return os.path.commonpath([path, folder]) == folder
    except ValueError:
        # Raised e.g. for paths on different drives: not nested
        return False


def main():
    """Build the grouped JPEGs for every identifier found under ``argv[1]``.

    For each identifier the output folder ``root_dir/identifier`` is
    emptied and regenerated. An identifier whose output folder would
    contain source PDFs is skipped so that inputs are never deleted.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <root_dir>")
        sys.exit(1)

    root_dir = sys.argv[1]
    print("Scanning files...")
    data = collect_files(root_dir)
    print(f"Found {len(data)} identifiers. Processing...")

    source_dirs = {
        os.path.dirname(path)
        for files_info in data.values()
        for _, path, _ in files_info
    }

    for identifier, files_info in data.items():
        target_folder = os.path.join(root_dir, identifier)

        # Clearing the output folder must never wipe input PDFs (possible
        # when a PDF is named like one of the source folders).
        if any(_is_inside(src, target_folder) for src in source_dirs):
            print(f"Skipping {identifier}: {target_folder} "
                  "contains source PDFs")
            continue

        # Clear output directory if it exists
        if os.path.isdir(target_folder):
            shutil.rmtree(target_folder)
        os.makedirs(target_folder, exist_ok=True)

        # files_info is a list of (dd, path, height)
        file_groups = group_files(files_info)
        for idx, group in enumerate(file_groups):
            create_jpg(identifier, idx, group, root_dir)

    print("Done.")


if __name__ == "__main__":
    main()