# Copies/cutleft.py

import sys
import os
import time
import json  # Added for schema output
import tkinter as tk
from threading import Thread
from queue import Queue, Empty
from pdf2image import convert_from_path
from PIL import Image, ImageTk

# --- Configuration ---
# Width (px) and RGB colour of the bar drawn between stitched columns.
DELIMITER_WIDTH = 5
DELIMITER_COLOR = (0, 0, 0)
# (width, height) the on-screen preview is resized to.
OUTPUT_SIZE = (1800, 1000)

# The script takes one positional argument: a single PDF or a folder of PDFs.
if len(sys.argv) < 2:
    sys.exit("Usage: python script.py <directory_path_or_file_path>")

path_arg = sys.argv[1]
files = []
INPUT_DIR = ""

if os.path.isdir(path_arg):
    # Folder mode: every *.pdf directly inside it, in sorted order.
    INPUT_DIR = path_arg
    files = sorted(
        entry for entry in os.listdir(INPUT_DIR)
        if entry.lower().endswith('.pdf')
    )
elif os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'):
    # Single-file mode: work inside the directory that holds the PDF.
    INPUT_DIR = os.path.dirname(path_arg)
    files = [os.path.basename(path_arg)]
else:
    sys.exit("Error: Input must be a directory or a PDF file.")

# All generated images and schemas go into a "Cutleft" folder next to the input.
OUTPUT_DIR = os.path.join(INPUT_DIR, 'Cutleft')

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# --- Processing Logic ---

def distribute_pages(total_pages, max_per_file=5):
    """
    Split `total_pages` into the fewest groups of at most `max_per_file`
    pages each, keeping the group sizes as even as possible.

    Returns a list with one page count per output file; any surplus pages
    go to the leading groups. An input of 0 pages yields an empty list.
    Example: 12 pages, max 5 -> [4, 4, 4]; 13 pages, max 5 -> [5, 4, 4]
    """
    if total_pages == 0:
        return []

    # Ceiling division: the smallest file count that respects the limit.
    file_count = (total_pages + max_per_file - 1) // max_per_file

    # Every file gets `per_file` pages; the first `extra` files get one more.
    per_file, extra = divmod(total_pages, file_count)

    return [
        per_file + 1 if slot < extra else per_file
        for slot in range(file_count)
    ]

def stitch_images(image_list):
    """
    Lay the given images out left-to-right on a white RGB canvas.

    A DELIMITER_WIDTH-wide bar in DELIMITER_COLOR is drawn between
    neighbouring images. Every image is pasted at the top edge; the canvas
    is as tall as the tallest image. Returns None for an empty list.
    """
    if not image_list:
        return None

    canvas_width = (
        sum(page.width for page in image_list)
        + (len(image_list) - 1) * DELIMITER_WIDTH
    )
    canvas_height = max(page.height for page in image_list)
    canvas = Image.new('RGB', (canvas_width, canvas_height), color=(255, 255, 255))

    cursor = 0
    for position, page in enumerate(image_list):
        canvas.paste(page, (cursor, 0))
        cursor += page.width

        # A separator follows every image except the last one.
        if position == len(image_list) - 1:
            continue
        bar = Image.new('RGB', (DELIMITER_WIDTH, canvas_height), color=DELIMITER_COLOR)
        canvas.paste(bar, (cursor, 0))
        cursor += DELIMITER_WIDTH

    return canvas

def process_single_pdf(filename, shift_offset=0, max_per_file=5):
    """
    Render one PDF from INPUT_DIR, crop a column out of every page and
    stitch the columns into output images.

    filename      -- PDF name relative to INPUT_DIR.
    shift_offset  -- horizontal offset (px) added to both crop edges;
                     ignored when max_per_file is 1.
    max_per_file  -- maximum number of columns per output image; the value 1
                     switches to full-width pages.

    Returns (preview_image_resized, list_of_split_images, schema_dict), or
    None when no page produced a usable crop or any exception occurred
    (the error is printed).
    """
    pdf_path = os.path.join(INPUT_DIR, filename)
    try:
        columns = []

        for page in convert_from_path(pdf_path):
            page_w, page_h = page.size

            if max_per_file == 1:
                # Single-page mode keeps the whole page width.
                x_start, x_end = 0, page_w
            else:
                # "Cutleft" mode: a strip about a third of the page wide,
                # starting 100px from the left, moved by the shift.
                x_start = 100 + shift_offset
                x_end = (page_w // 3) + 100 + shift_offset

            # Clamp to the page; an empty or inverted box drops the page.
            x_start = max(0, x_start)
            x_end = min(page_w, x_end)
            if x_end <= x_start:
                continue

            columns.append(page.crop((x_start, 0, x_end, page_h)))

        if not columns:
            return None

        # 1. Decide how many columns go into each output image.
        col_distribution = distribute_pages(len(columns), max_per_file=max_per_file)

        # 2. Stitch each group of consecutive columns at full resolution.
        split_images = []
        start = 0
        for group_size in col_distribution:
            stop = start + group_size
            split_images.append(stitch_images(columns[start:stop]))
            start = stop

        # 3. One preview holding every column, scaled to the display size.
        preview_resized = stitch_images(columns).resize(OUTPUT_SIZE, Image.LANCZOS)

        schema = {
            "original_filename": filename,
            "total_pages": len(columns),
            "number_of_files": len(split_images),
            "columns_per_file": col_distribution
        }

        return (preview_resized, split_images, schema)

    except Exception as e:
        print(f"Error processing {filename}: {e}")
        return None

def save_results(result_tuple, filename):
    """
    Write the split images and the schema JSON for one PDF into OUTPUT_DIR.

    result_tuple -- (preview, split_images, schema) as returned by
                    process_single_pdf; the preview is not written.
    filename     -- name of the source PDF; its stem prefixes every output.

    Earlier outputs for the same stem are removed first, so a re-run that
    produces fewer images leaves no leftovers behind.
    """
    _, splits, schema = result_tuple
    base_name = os.path.splitext(filename)[0]
    json_filename = f"{base_name}_schema.json"
    image_prefix = f"{base_name}_"
    image_ext = ".jpg"

    # --- Cleanup: remove what a previous run wrote for this PDF ---
    for entry in os.listdir(OUTPUT_DIR):
        if entry == json_filename:
            outdated = True
        elif entry.startswith(image_prefix) and entry.endswith(image_ext):
            # Only "<stem>_<digits>.jpg" is ours; this spares unrelated files
            # such as "file_v2_01.jpg" while processing "file.pdf".
            outdated = entry[len(image_prefix):-len(image_ext)].isdigit()
        else:
            outdated = False

        if outdated:
            os.remove(os.path.join(OUTPUT_DIR, entry))

    # --- Images: numbered _01, _02, ... in split order ---
    for number, img in enumerate(splits, start=1):
        output_filename = f"{base_name}_{number:02d}.jpg"
        img.save(os.path.join(OUTPUT_DIR, output_filename), "JPEG", quality=95)
        print(f"Saved: {output_filename}")

    # --- Schema ---
    with open(os.path.join(OUTPUT_DIR, json_filename), 'w') as f:
        json.dump(schema, f, indent=4)
    print(f"Saved schema: {json_filename}")
# --- GUI Application ---

class ImageReviewer:
    """
    Tk window that shows a preview for each PDF and lets the user adjust it.

    The current file is processed on a short-lived thread ("manual"
    processing) whose result is polled from `manual_queue`. A long-running
    daemon thread processes the file after the current one with default
    settings and offers the result through `prefetch_queue`.

    Keys: Enter = next file, n / N = shift crop +50 / +100 px,
    t = shift crop -50 px, 1 = one full-width page per output image.
    """

    # Column limit per output image applied to every newly opened file.
    DEFAULT_MAX_PER_FILE = 5

    def __init__(self, file_list):
        """
        Build the window, start the prefetch thread, load the first file and
        run the Tk main loop. The call returns only when the window closes.

        file_list -- PDF file names (relative to INPUT_DIR) to review in order.
        """
        self.files = file_list
        self.index = 0
        self.current_shift = 0
        self.current_max_per_file = self.DEFAULT_MAX_PER_FILE
        self.current_preview = None  # Only stores the resized preview for GUI
        self.is_processing = False

        # Queue for pre-fetched results: (index, (preview, splits, schema))
        self.prefetch_queue = Queue(maxsize=1)
        # Queue for manual re-processing results
        self.manual_queue = Queue()

        # Setup GUI
        self.root = tk.Tk()
        self.root.title("PDF Cropper")
        self.root.geometry("+100+100")

        self.label_img = tk.Label(self.root)
        self.label_img.pack()

        self.label_info = tk.Label(self.root, text="", font=("Arial", 12, "bold"))
        self.label_info.pack(pady=5)

        # Bindings
        self.root.bind('<Return>', self.on_next)
        self.root.bind('n', lambda e: self.on_shift(50))
        self.root.bind('N', lambda e: self.on_shift(100))
        self.root.bind('t', lambda e: self.on_shift(-50))
        self.root.bind('1', lambda e: self.on_set_max_pages(1))

        # Start background pre-fetcher
        self.bg_thread = Thread(target=self.prefetch_worker, daemon=True)
        self.bg_thread.start()

        # Load first image
        self.load_current_image()

        self.root.lift()
        self.root.focus_force()
        self.root.mainloop()

    def on_set_max_pages(self, count):
        """Re-process the current file with `count` columns per output image."""
        if self.is_processing:
            return
        self.current_max_per_file = count
        print(f"Setting max pages per file: {count}")
        # Trigger reprocessing with current settings
        self.trigger_processing(self.files[self.index], self.current_shift)

    def prefetch_worker(self):
        """
        Background loop: process the file after the current one (default
        settings) and offer the result to the GUI. Never returns.
        """
        last_attempted = 0
        while True:
            target = self.index + 1
            if target < len(self.files) and target != last_attempted:
                result = process_single_pdf(self.files[target], shift_offset=0)
                # Record the attempt even when it failed; otherwise a broken
                # PDF would be re-converted in an endless loop. The GUI still
                # retries it once itself when the user reaches that file.
                last_attempted = target
                if result:
                    self._offer_prefetched(target, result)

            time.sleep(0.1)

    def _offer_prefetched(self, target, result):
        """
        Hand a prefetched result to the GUI unless it is already outdated.

        Waits while the queue is full. Gives up, returning False, as soon as
        the GUI has reached `target` itself, because a late result would
        otherwise sit in the single-slot queue and block every later prefetch.
        Returns True when the result was queued.
        """
        while self.index < target:
            # This thread is the only producer, so a free slot stays free
            # until the put below.
            if not self.prefetch_queue.full():
                self.prefetch_queue.put((target, result))
                return True
            time.sleep(0.1)
        return False

    def _take_prefetched(self):
        """
        Return the prefetched result for the current index, or None.

        Entries for files the user has already passed are discarded so they
        cannot clog the queue; an entry for a later file is left in place.
        """
        # The GUI thread is the only consumer, so peek-then-get cannot block.
        while not self.prefetch_queue.empty():
            q_idx, q_result = self.prefetch_queue.queue[0]
            if q_idx > self.index:
                return None
            self.prefetch_queue.get()
            if q_idx == self.index:
                return q_result
        return None

    def load_current_image(self, use_prefetch=False):
        """
        Show the file at `self.index`, closing the window when none is left.

        use_prefetch -- when True, use a matching prefetched result if one is
                        available; otherwise the file is processed now.
        """
        if self.index >= len(self.files):
            print("All files processed.")
            self.root.destroy()
            return

        filename = self.files[self.index]
        self.is_processing = False

        result_found = self._take_prefetched() if use_prefetch else None

        if result_found:
            self.current_shift = 0
            print(f"Loaded {filename} from prefetch.")
            self.handle_processing_result(result_found, filename)
        else:
            # Not in queue (first load or queue mismatch), process manually
            self.trigger_processing(filename, self.current_shift)

    def trigger_processing(self, filename, shift):
        """Starts a thread to process image so GUI doesn't freeze."""
        self.is_processing = True
        self.label_info.configure(text=f"Processing {filename} (Shift {shift})... Please wait.", fg="red")

        # Snapshot the setting on the GUI thread rather than reading shared
        # state from the worker.
        max_per_file = self.current_max_per_file

        def worker():
            try:
                res = process_single_pdf(filename, shift, max_per_file)
            except Exception as exc:
                # Always deliver something; otherwise the GUI would poll
                # forever with the keys disabled.
                print(f"Error processing {filename}: {exc}")
                res = None
            self.manual_queue.put(res)

        Thread(target=worker, daemon=True).start()
        self.check_manual_queue(filename)

    def check_manual_queue(self, filename):
        """Polls the manual queue for the result of processing `filename`."""
        try:
            result = self.manual_queue.get_nowait()
        except Empty:
            # Check again in 100ms
            self.root.after(100, lambda: self.check_manual_queue(filename))
            return

        # Clear the busy flag BEFORE dispatching: the failure path below may
        # start processing the next file, which raises the flag again, and
        # that must not be overwritten afterwards.
        self.is_processing = False

        if result:
            self.handle_processing_result(result, filename)
        else:
            print(f"Failed to process {filename}, skipping.")
            self.index += 1
            # Start the next file from defaults, as on_next does; a shift
            # that emptied this file would otherwise empty the next one too.
            self.current_shift = 0
            self.current_max_per_file = self.DEFAULT_MAX_PER_FILE
            self.load_current_image(use_prefetch=True)

    def handle_processing_result(self, result, filename):
        """Unpacks result, saves files, and updates display."""
        preview, _, schema = result
        self.current_preview = preview

        # Save in a background thread so the GUI updates instantly
        Thread(target=save_results, args=(result, filename), daemon=True).start()

        self.update_display(filename, schema)

    def update_display(self, filename, schema=None):
        """Show the current preview and refresh the status/help line."""
        if self.current_preview:
            tk_image = ImageTk.PhotoImage(self.current_preview)
            self.label_img.configure(image=tk_image)
            # Keep a reference on the widget so the image is not garbage
            # collected while it is displayed.
            self.label_img.image = tk_image

            schema_info = ""
            if schema:
                cols = str(schema['columns_per_file'])
                schema_info = f"\nFiles: {schema['number_of_files']} | Cols: {cols}"

            self.label_info.configure(
                text=f"[{self.index+1}/{len(self.files)}] {filename} | Shift: {self.current_shift}px"
                     f"{schema_info}\n"
                     f"Enter: Next | n: +50 | N: +100 | t: -50 | 1: use single column",
                fg="black"
            )

    def on_shift(self, amount):
        """Move the crop window by `amount` pixels and re-process the file."""
        if self.is_processing:
            return  # Ignore keys while processing
        self.current_shift += amount
        print(f"Applying shift: {self.current_shift}")
        self.trigger_processing(self.files[self.index], self.current_shift)

    def on_next(self, event=None):
        """Advance to the next file with default shift and column limit."""
        if self.is_processing:
            return
        self.index += 1
        self.current_shift = 0
        self.current_max_per_file = self.DEFAULT_MAX_PER_FILE  # Reset to default
        self.load_current_image(use_prefetch=True)

# --- Entry Point ---
if __name__ == "__main__":
    # Only open the review window when there is at least one PDF to show.
    if files:
        app = ImageReviewer(files)
    else:
        print("No PDF files found.")