"""Interactive PDF "cut-left" cropper.

Renders each page of a PDF, crops a vertical strip from every page, stitches
the strips side by side into one or more JPEG files, and writes a JSON schema
describing how the pages were distributed.  A small Tk window shows a preview
and lets the user shift the crop window or switch to single-page mode before
moving to the next PDF.
"""

import json  # Added for schema output
import os
import sys
import time
import tkinter as tk
from functools import lru_cache
from queue import Empty, Queue
from threading import Thread

from pdf2image import convert_from_path
from PIL import Image, ImageTk

# --- Configuration ---
DELIMITER_WIDTH = 5
DELIMITER_COLOR = (0, 0, 0)
OUTPUT_SIZE = (1800, 1000)
# Default upper bound on the number of page strips stitched into one file.
DEFAULT_MAX_PER_FILE = 5

if len(sys.argv) < 2:
    sys.exit("Usage: python script.py <pdf_file_or_directory>")

path_arg = sys.argv[1]
files = []
INPUT_DIR = ""

if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'):
    INPUT_DIR = os.path.dirname(path_arg)
    files = [os.path.basename(path_arg)]
elif os.path.isdir(path_arg):
    INPUT_DIR = path_arg
    files = sorted(f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf'))
else:
    sys.exit("Error: Input must be a directory or a PDF file.")

OUTPUT_DIR = os.path.join(INPUT_DIR, 'Cutleft')
os.makedirs(OUTPUT_DIR, exist_ok=True)


# --- Processing Logic ---
def distribute_pages(total_pages, max_per_file=DEFAULT_MAX_PER_FILE):
    """Split ``total_pages`` into balanced chunks of at most ``max_per_file``.

    The minimum number of files is used and the pages are spread as evenly as
    possible, with any remainder going to the first files.

    Example: 12 pages, max 5 -> [4, 4, 4]

    Args:
        total_pages: Number of pages (columns) to distribute.
        max_per_file: Maximum number of pages allowed in one file (>= 1).

    Returns:
        A list with one page count per output file; empty if there are no pages.

    Raises:
        ValueError: If ``max_per_file`` is smaller than 1.
    """
    if max_per_file < 1:
        raise ValueError("max_per_file must be at least 1")
    if total_pages == 0:
        return []

    # Ceiling division: minimum number of files needed.
    num_files = (total_pages + max_per_file - 1) // max_per_file
    base_count, remainder = divmod(total_pages, num_files)

    # The first `remainder` files each take one extra page.
    return [base_count + (1 if i < remainder else 0) for i in range(num_files)]


def stitch_images(image_list):
    """Stitch images horizontally, separated by vertical delimiter bars.

    Args:
        image_list: Sequence of PIL images.

    Returns:
        A new RGB image on a white background as tall as the tallest input,
        or ``None`` if ``image_list`` is empty.
    """
    if not image_list:
        return None

    num_images = len(image_list)
    total_width = sum(img.width for img in image_list) + (num_images - 1) * DELIMITER_WIDTH
    max_height = max(img.height for img in image_list)

    combined = Image.new('RGB', (total_width, max_height), color=(255, 255, 255))

    x_offset = 0
    for idx, img in enumerate(image_list):
        combined.paste(img, (x_offset, 0))
        x_offset += img.width

        # A delimiter goes between images, never after the last one.
        if idx < num_images - 1:
            delimiter = Image.new('RGB', (DELIMITER_WIDTH, max_height), color=DELIMITER_COLOR)
            combined.paste(delimiter, (x_offset, 0))
            x_offset += DELIMITER_WIDTH

    return combined


@lru_cache(maxsize=3)
def get_pdf_pages(filename):
    """Render every page of ``INPUT_DIR/filename`` to an image (cached).

    The small LRU cache keeps the renders of the most recently used files so
    that re-cropping the current file with another shift does not re-render it.
    """
    pdf_path = os.path.join(INPUT_DIR, filename)
    return convert_from_path(pdf_path)


def process_single_pdf(filename, shift_offset=0, max_per_file=DEFAULT_MAX_PER_FILE):
    """Convert a PDF into stitched strip images.

    Args:
        filename: PDF file name relative to ``INPUT_DIR``.
        shift_offset: Horizontal shift in pixels applied to the crop window.
        max_per_file: Maximum number of page strips per output image.  A value
            of 1 selects single-page mode, where the full page width is kept
            and ``shift_offset`` is ignored.

    Returns:
        A tuple ``(preview_image_resized, list_of_split_images, schema_dict)``,
        or ``None`` if no page yields a non-empty crop or any error occurs
        (the error is printed).
    """
    try:
        pages = get_pdf_pages(filename)
        cropped_images = []

        for img in pages:
            width, height = img.size

            if max_per_file == 1:
                # Single Page mode: take the full width (ignore shift/crop).
                left = 0
                right = width
            else:
                # Original "Cutleft" logic (approx 1/3 width).
                left = 100 + shift_offset
                right = (width // 3) + 100 + shift_offset

            # Clamp the crop box to the page.
            left = max(0, left)
            right = min(width, right)

            # Pages whose crop window collapsed are dropped.
            if right > left:
                cropped_images.append(img.crop((left, 0, right, height)))

        if not cropped_images:
            return None

        # 1. Generate Schema / Distribution
        col_distribution = distribute_pages(len(cropped_images), max_per_file=max_per_file)

        # 2. Generate Split Images (Full Resolution)
        split_images = []
        current_idx = 0
        for count in col_distribution:
            chunk = cropped_images[current_idx:current_idx + count]
            split_images.append(stitch_images(chunk))
            current_idx += count

        # 3. Generate Preview (All stitched together, Resized)
        full_stitch = stitch_images(cropped_images)
        preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.LANCZOS)

        schema = {
            "original_filename": filename,
            "total_pages": len(cropped_images),
            "number_of_files": len(split_images),
            "columns_per_file": col_distribution
        }

        return (preview_resized, split_images, schema)

    except Exception as e:
        # Best-effort boundary: a broken PDF must not take down the GUI.
        print(f"Error processing {filename}: {e}")
        return None


def save_results(result_tuple, filename):
    """Save the split images and the schema JSON for one PDF.

    Existing outputs of the same PDF in ``OUTPUT_DIR`` are removed first so
    that a re-run producing fewer files does not leave stale images behind.

    Args:
        result_tuple: ``(preview, splits, schema)`` as returned by
            ``process_single_pdf``; the preview is not saved.
        filename: Name of the source PDF; its stem prefixes all output names.
    """
    _, splits, schema = result_tuple
    base_name = os.path.splitext(filename)[0]

    # --- Cleanup: Delete existing files for this PDF ---
    for existing in os.listdir(OUTPUT_DIR):
        file_path = os.path.join(OUTPUT_DIR, existing)

        # 1. Delete schema file
        if existing == f"{base_name}_schema.json":
            os.remove(file_path)

        # 2. Delete image files (pattern: basename_01.jpg, etc.)
        elif existing.startswith(f"{base_name}_") and existing.endswith(".jpg"):
            # Check if the suffix is strictly numeric (e.g. "01") to avoid
            # deleting unrelated files like "file_v2_01.jpg" when processing "file.pdf"
            suffix = existing[len(base_name) + 1:-4]
            if suffix.isdigit():
                os.remove(file_path)
    # ---------------------------------------------------

    # Save Images
    for i, img in enumerate(splits):
        # Suffix _01, _02, etc.
        suffix = f"_{i+1:02d}"
        output_filename = f"{base_name}{suffix}.jpg"
        output_path = os.path.join(OUTPUT_DIR, output_filename)

        img.save(output_path, "JPEG", quality=95)
        print(f"Saved: {output_filename}")

    # Save Schema
    json_filename = f"{base_name}_schema.json"
    json_path = os.path.join(OUTPUT_DIR, json_filename)
    with open(json_path, 'w', encoding="utf-8") as schema_file:
        json.dump(schema, schema_file, indent=4)
    print(f"Saved schema: {json_filename}")


# --- GUI Application ---
class ImageReviewer:
    """Tk window that previews each PDF's stitched crop and saves the results.

    Key bindings: Enter = next file, n / N = shift crop by +50 / +100 px,
    t = shift by -50 px, 1 = single-page mode for the current file.
    """

    def __init__(self, file_list):
        """Build the window, start the prefetch thread and run the main loop.

        Args:
            file_list: PDF file names (relative to ``INPUT_DIR``) to review.
        """
        self.files = file_list
        self.index = 0
        self.current_shift = 0
        self.current_max_per_file = DEFAULT_MAX_PER_FILE
        self.current_preview = None  # Only stores the resized preview for GUI
        self.is_processing = False

        # Queue for pre-fetched results (index, (preview, splits, schema))
        self.prefetch_queue = Queue(maxsize=1)

        # Queue for manual re-processing results
        self.manual_queue = Queue()

        # Setup GUI
        self.root = tk.Tk()
        self.root.title("PDF Cropper")
        self.root.geometry("+100+100")

        self.label_img = tk.Label(self.root)
        self.label_img.pack()

        self.label_info = tk.Label(self.root, text="", font=("Arial", 12, "bold"))
        self.label_info.pack(pady=5)

        # Bindings
        # Enter advances to the next file (matches the on-screen help text).
        self.root.bind('<Return>', self.on_next)
        self.root.bind('n', lambda e: self.on_shift(50))
        self.root.bind('N', lambda e: self.on_shift(100))
        self.root.bind('t', lambda e: self.on_shift(-50))
        self.root.bind('1', lambda e: self.on_set_max_pages(1))  # New Binding

        # Start background pre-fetcher
        self.bg_thread = Thread(target=self.prefetch_worker, daemon=True)
        self.bg_thread.start()

        # Load first image
        self.load_current_image()

        self.root.lift()
        self.root.focus_force()
        self.root.mainloop()

    def on_set_max_pages(self, count):
        """Re-process the current file with a new max-pages-per-file value."""
        if self.is_processing:
            return
        self.current_max_per_file = count
        print(f"Setting max pages per file: {count}")
        # Trigger reprocessing with current settings
        self.trigger_processing(self.files[self.index], self.current_shift)

    def prefetch_worker(self):
        """Background loop that processes the file after the current one.

        Results are pushed to ``prefetch_queue`` as ``(index, result)`` using
        the default shift and max-per-file.
        """
        last_attempted = 0
        while True:
            target = self.index + 1
            if target < len(self.files) and last_attempted != target:
                result = process_single_pdf(self.files[target], shift_offset=0)
                # Remember the attempt even when it failed; otherwise a broken
                # PDF would be re-rendered (and its error re-printed) every
                # 100 ms until the user moves on.
                last_attempted = target
                if result:
                    self.prefetch_queue.put((target, result))  # Blocks if full
            time.sleep(0.1)

    def load_current_image(self, use_prefetch=False):
        """Show the file at ``self.index``, or close the window when done.

        Args:
            use_prefetch: If true, use a matching prefetched result when one
                is available instead of processing the file again.
        """
        if self.index >= len(self.files):
            print("All files processed.")
            self.root.destroy()
            return

        filename = self.files[self.index]
        self.is_processing = False

        result_found = None

        # Only this (GUI) thread consumes the queue, so peeking at the head
        # and then removing it is safe.
        while use_prefetch and not self.prefetch_queue.empty():
            q_idx, q_result = self.prefetch_queue.queue[0]
            if q_idx > self.index:
                break  # Belongs to a later file; leave it in place.
            self.prefetch_queue.get()
            if q_idx == self.index:
                result_found = q_result
                self.current_shift = 0
                print(f"Loaded {filename} from prefetch.")
                break
            # Otherwise the entry is stale (an earlier file).  It is dropped so
            # it cannot sit in the size-1 queue forever and block the worker.

        if result_found:
            self.handle_processing_result(result_found, filename)
        else:
            # Not in queue (first load or queue mismatch), process manually
            self.trigger_processing(filename, self.current_shift)

    def trigger_processing(self, filename, shift):
        """Start a thread to process ``filename`` so the GUI doesn't freeze."""
        self.is_processing = True
        self.label_info.configure(
            text=f"Processing {filename} (Shift {shift})... Please wait.", fg="red"
        )

        def worker():
            res = process_single_pdf(filename, shift, self.current_max_per_file)
            self.manual_queue.put(res)

        Thread(target=worker, daemon=True).start()
        self.check_manual_queue(filename)

    def check_manual_queue(self, filename):
        """Poll the manual queue and act on the result once it arrives."""
        try:
            result = self.manual_queue.get_nowait()
        except Empty:
            # Check again in 100ms
            self.root.after(100, lambda: self.check_manual_queue(filename))
            return

        # Clear the flag *before* dispatching: the skip path below may start
        # processing the next file, and that must leave the flag set.
        self.is_processing = False

        if result:
            self.handle_processing_result(result, filename)
        else:
            print(f"Failed to process {filename}, skipping.")
            self.index += 1
            # Start the next file from defaults, exactly like on_next does, so
            # a shift that emptied the crop box is not carried over.
            self.current_shift = 0
            self.current_max_per_file = DEFAULT_MAX_PER_FILE
            self.load_current_image(use_prefetch=True)

    def handle_processing_result(self, result, filename):
        """Unpack a result, save its files in the background and show it."""
        preview, _splits, schema = result
        self.current_preview = preview

        # Save in a background thread so the GUI updates instantly.  The thread
        # is deliberately non-daemon so that a save still in flight when the
        # window closes is finished before the interpreter exits.
        Thread(target=save_results, args=(result, filename), daemon=False).start()

        self.update_display(filename, schema)

    def update_display(self, filename, schema=None):
        """Refresh the preview image and the info/help label."""
        if self.current_preview:
            tk_image = ImageTk.PhotoImage(self.current_preview)
            self.label_img.configure(image=tk_image)
            # Keep a reference on the widget so the image is not garbage-collected.
            self.label_img.image = tk_image

        schema_info = ""
        if schema:
            cols = str(schema['columns_per_file'])
            schema_info = f"\nFiles: {schema['number_of_files']} | Cols: {cols}"

        self.label_info.configure(
            text=f"[{self.index+1}/{len(self.files)}] {filename} | Shift: {self.current_shift}px"
                 f"{schema_info}\n"
                 f"Enter: Next | n: +50 | N: +100 | t: -50 | 1: use single column",
            fg="black"
        )

    def on_shift(self, amount):
        """Shift the crop window by ``amount`` pixels and re-process."""
        if self.is_processing:
            return  # Ignore keys while processing
        self.current_shift += amount
        print(f"Applying shift: {self.current_shift}")
        self.trigger_processing(self.files[self.index], self.current_shift)

    def on_next(self, event):
        """Advance to the next file with default shift and max-per-file."""
        if self.is_processing:
            return
        self.index += 1
        self.current_shift = 0
        self.current_max_per_file = DEFAULT_MAX_PER_FILE  # Reset to default
        self.load_current_image(use_prefetch=True)


# --- Entry Point ---
if __name__ == "__main__":
    if not files:
        print("No PDF files found.")
    else:
        app = ImageReviewer(files)