# Source listing: 354 lines, 12 KiB, Python
import sys
|
|
import os
|
|
import time
|
|
import json # Added for schema output
|
|
import tkinter as tk
|
|
from threading import Thread
|
|
from queue import Queue, Empty
|
|
from pdf2image import convert_from_path
|
|
from PIL import Image, ImageTk
|
|
|
|
# --- Configuration ---
|
|
# Width, in pixels, of the vertical bar drawn between stitched pages.
DELIMITER_WIDTH = 5
# RGB fill colour of that bar (black).
DELIMITER_COLOR = (0, 0, 0)
# (width, height) in pixels that the on-screen preview is resized to.
OUTPUT_SIZE = (1800, 1000)

if len(sys.argv) < 2:
    sys.exit("Usage: python script.py <directory_path_or_file_path>")

# The single command-line argument is either one PDF or a directory of PDFs.
path_arg = sys.argv[1]
files = []
INPUT_DIR = ""

if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'):
    # Single-file mode: work relative to the directory containing the PDF.
    INPUT_DIR = os.path.dirname(path_arg)
    files = [os.path.basename(path_arg)]
elif os.path.isdir(path_arg):
    INPUT_DIR = path_arg
    # Only regular files count: a sub-directory whose name happens to end in
    # ".pdf" would otherwise be queued and fail later during conversion.
    files = sorted(
        entry for entry in os.listdir(INPUT_DIR)
        if entry.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(INPUT_DIR, entry))
    )
else:
    sys.exit("Error: Input must be a directory or a PDF file.")

# All generated images and schema files go into a "Cutleft" sub-directory
# next to the input PDFs.
OUTPUT_DIR = os.path.join(INPUT_DIR, 'Cutleft')

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
|
# --- Processing Logic ---
|
|
|
|
def distribute_pages(total_pages, max_per_file=5):
    """Split ``total_pages`` into balanced chunk sizes of at most ``max_per_file``.

    The fewest possible chunks are used, and pages are spread as evenly as
    possible across them; any extra pages go to the earliest chunks.

    Examples:
        12 pages, max 5 -> [4, 4, 4]
        13 pages, max 5 -> [5, 4, 4]

    Args:
        total_pages: Number of pages (columns) to distribute.
        max_per_file: Upper bound on the size of any single chunk.

    Returns:
        A list of chunk sizes summing to ``total_pages``; an empty list when
        ``total_pages`` is zero or negative.

    Raises:
        ValueError: If there are pages to distribute but ``max_per_file`` is
            smaller than 1 (previously a ZeroDivisionError or a silently
            empty result).
    """
    if total_pages <= 0:
        return []
    if max_per_file < 1:
        raise ValueError(f"max_per_file must be >= 1, got {max_per_file}")

    # Ceiling division: the minimum number of files that can hold every page.
    num_files = -(-total_pages // max_per_file)

    # Every file gets base_count pages; the first `remainder` files get one more.
    base_count, remainder = divmod(total_pages, num_files)
    return [base_count + 1] * remainder + [base_count] * (num_files - remainder)
|
|
|
|
def stitch_images(image_list):
    """Join images side by side, left to right, with a bar between neighbours.

    The canvas is white, as tall as the tallest image and as wide as all
    images plus one ``DELIMITER_WIDTH`` bar between each adjacent pair.
    Every image is pasted at the top edge (y = 0).

    Returns:
        The combined RGB image, or ``None`` when ``image_list`` is empty.
    """
    if not image_list:
        return None

    last_position = len(image_list) - 1
    canvas_width = sum(page.width for page in image_list) + last_position * DELIMITER_WIDTH
    canvas_height = max(page.height for page in image_list)
    canvas = Image.new('RGB', (canvas_width, canvas_height), color=(255, 255, 255))

    cursor = 0  # x coordinate where the next piece is pasted
    for position, page in enumerate(image_list):
        canvas.paste(page, (cursor, 0))
        cursor += page.width

        # No trailing bar after the final image.
        if position == last_position:
            break

        bar = Image.new('RGB', (DELIMITER_WIDTH, canvas_height), color=DELIMITER_COLOR)
        canvas.paste(bar, (cursor, 0))
        cursor += DELIMITER_WIDTH

    return canvas
|
|
|
|
def process_single_pdf(filename, shift_offset=0, max_per_file=5):
    """Render a PDF, crop one strip per page and stitch the strips into images.

    When ``max_per_file`` is 1 each page is kept at full width (the shift is
    ignored). Otherwise the strip starts at x = 100 + ``shift_offset`` and is
    one third of the page width wide, clamped to the page bounds. Pages whose
    clamped strip is empty are dropped.

    Args:
        filename: PDF file name, resolved relative to ``INPUT_DIR``.
        shift_offset: Horizontal shift, in pixels, applied to the crop strip.
        max_per_file: Maximum number of strips stitched into one output image.

    Returns:
        ``(preview_image_resized, list_of_split_images, schema_dict)``, or
        ``None`` if no strip could be produced or any exception was raised
        (the error is printed).
    """
    pdf_path = os.path.join(INPUT_DIR, filename)
    try:
        full_width_mode = max_per_file == 1
        strips = []

        for page in convert_from_path(pdf_path):
            page_width, page_height = page.size

            if full_width_mode:
                # Single-page mode: keep the whole page, ignore shift/crop.
                x_start, x_end = 0, page_width
            else:
                # "Cutleft" strip: roughly the left third, nudged by the shift.
                x_start = 100 + shift_offset
                x_end = (page_width // 3) + 100 + shift_offset

            # Clamp the strip to the page.
            x_start = max(0, x_start)
            x_end = min(page_width, x_end)

            if x_end <= x_start:
                continue  # nothing left of this page after clamping
            strips.append(page.crop((x_start, 0, x_end, page_height)))

        if not strips:
            return None

        # 1. Decide how many strips go into each output file.
        col_distribution = distribute_pages(len(strips), max_per_file=max_per_file)

        # 2. Stitch each group at full resolution.
        split_images = []
        start = 0
        for group_size in col_distribution:
            split_images.append(stitch_images(strips[start:start + group_size]))
            start += group_size

        # 3. Build the GUI preview: every strip stitched together, then resized.
        preview_resized = stitch_images(strips).resize(OUTPUT_SIZE, Image.LANCZOS)

        schema = {
            "original_filename": filename,
            "total_pages": len(strips),
            "number_of_files": len(split_images),
            "columns_per_file": col_distribution
        }
        return (preview_resized, split_images, schema)

    except Exception as e:
        print(f"Error processing {filename}: {e}")
        return None
|
|
|
|
def save_results(result_tuple, filename):
    """Write the split images and the schema JSON for one PDF into ``OUTPUT_DIR``.

    Earlier output for the same PDF is removed first: its schema file and any
    ``<base>_<digits>.jpg`` image. Images are then written as ``<base>_01.jpg``,
    ``<base>_02.jpg``, ... (JPEG, quality 95), followed by
    ``<base>_schema.json``.

    Args:
        result_tuple: ``(preview, split_images, schema)``; the preview is unused.
        filename: Name of the source PDF; its stem is the output base name.
    """
    _, splits, schema = result_tuple
    base_name = os.path.splitext(filename)[0]
    json_filename = f"{base_name}_schema.json"
    image_prefix = f"{base_name}_"

    # --- Cleanup: remove whatever a previous run wrote for this PDF ---
    for entry in os.listdir(OUTPUT_DIR):
        if entry == json_filename:
            is_stale = True
        elif entry.startswith(image_prefix) and entry.endswith(".jpg"):
            # Only a purely numeric suffix ("01") counts, so "file_v2_01.jpg"
            # survives while "file.pdf" is being processed.
            is_stale = entry[len(image_prefix):-4].isdigit()
        else:
            is_stale = False

        if is_stale:
            os.remove(os.path.join(OUTPUT_DIR, entry))

    # --- Images: numbered from 01 ---
    for number, img in enumerate(splits, start=1):
        output_filename = f"{base_name}_{number:02d}.jpg"
        img.save(os.path.join(OUTPUT_DIR, output_filename), "JPEG", quality=95)
        print(f"Saved: {output_filename}")

    # --- Schema ---
    with open(os.path.join(OUTPUT_DIR, json_filename), 'w') as f:
        json.dump(schema, f, indent=4)
    print(f"Saved schema: {json_filename}")
|
|
# --- GUI Application ---
|
|
|
|
class ImageReviewer:
    """Tkinter window for reviewing the stitched preview of each PDF in turn.

    Each file is processed off the GUI thread, its output is saved as soon as
    a result is available, and the preview is shown. Keys:

        Enter  advance to the next file
        n / N  shift the crop strip by +50 / +100 px and reprocess
        t      shift the crop strip by -50 px and reprocess
        1      reprocess with one page per output file (full page width)

    A daemon thread pre-processes the next file with default settings so that
    advancing is usually instant.

    Note: constructing the object runs the Tk main loop and only returns once
    the window has been closed.
    """

    # Max pages per output file used for every newly opened PDF.
    DEFAULT_MAX_PER_FILE = 5

    def __init__(self, file_list):
        """Build the window, start prefetching and block in the Tk main loop.

        Args:
            file_list: PDF file names (relative to ``INPUT_DIR``) to review.
        """
        self.files = file_list
        self.index = 0
        self.current_shift = 0
        self.current_max_per_file = self.DEFAULT_MAX_PER_FILE
        self.current_preview = None  # Only stores the resized preview for GUI
        self.is_processing = False
        # Most recently started save thread; used to keep saves in order.
        self.last_save_thread = None

        # Queue for pre-fetched results: (index, (preview, splits, schema))
        self.prefetch_queue = Queue(maxsize=1)
        # Queue for manual (re-)processing results
        self.manual_queue = Queue()

        # Setup GUI
        self.root = tk.Tk()
        self.root.title("PDF Cropper")
        self.root.geometry("+100+100")

        self.label_img = tk.Label(self.root)
        self.label_img.pack()

        self.label_info = tk.Label(self.root, text="", font=("Arial", 12, "bold"))
        self.label_info.pack(pady=5)

        # Bindings
        self.root.bind('<Return>', self.on_next)
        self.root.bind('n', lambda e: self.on_shift(50))
        self.root.bind('N', lambda e: self.on_shift(100))
        self.root.bind('t', lambda e: self.on_shift(-50))
        self.root.bind('1', lambda e: self.on_set_max_pages(1))

        # Start background pre-fetcher
        self.bg_thread = Thread(target=self.prefetch_worker, daemon=True)
        self.bg_thread.start()

        # Load first image
        self.load_current_image()

        self.root.lift()
        self.root.focus_force()
        self.root.mainloop()

    def on_set_max_pages(self, count):
        """Reprocess the current file with ``count`` pages per output file."""
        if self.is_processing:
            return
        self.current_max_per_file = count
        print(f"Setting max pages per file: {count}")
        # Trigger reprocessing with current settings
        self.trigger_processing(self.files[self.index], self.current_shift)

    def prefetch_worker(self):
        """Background loop that pre-processes the file after the current one.

        Each target index is attempted once, whether or not it succeeds, so a
        broken PDF is not re-rendered every 100 ms. A result is queued only
        while it is still ahead of the GUI; otherwise it is dropped, so an
        outdated entry cannot sit in the size-1 queue and block later work.
        """
        last_attempted = None
        while True:
            target = self.index + 1
            if target < len(self.files) and target != last_attempted:
                result = process_single_pdf(self.files[target], shift_offset=0)
                last_attempted = target

                # This thread is the only producer, so a non-full queue means
                # put() cannot block. While the queue is full, keep checking
                # whether the GUI has already moved past `target`.
                while result and target > self.index:
                    if not self.prefetch_queue.full():
                        self.prefetch_queue.put((target, result))
                        break
                    time.sleep(0.1)

            time.sleep(0.1)

    def load_current_image(self, use_prefetch=False):
        """Show the file at ``self.index``, or close the window when done.

        Args:
            use_prefetch: When true, use a queued prefetch result for the
                current index if there is one. Entries for any other index
                are discarded so they cannot clog the queue.
        """
        if self.index >= len(self.files):
            print("All files processed.")
            self.root.destroy()
            return

        filename = self.files[self.index]
        self.is_processing = False

        result_found = None
        if use_prefetch:
            while result_found is None:
                try:
                    q_idx, q_result = self.prefetch_queue.get_nowait()
                except Empty:
                    break
                if q_idx == self.index:
                    result_found = q_result
                    self.current_shift = 0
                    print(f"Loaded {filename} from prefetch.")
                # Any other index is stale: drop it and keep draining.

        if result_found is not None:
            self.handle_processing_result(result_found, filename)
        else:
            # Not in queue (first load or queue mismatch), process manually
            self.trigger_processing(filename, self.current_shift)

    def trigger_processing(self, filename, shift):
        """Process ``filename`` on a worker thread so the GUI doesn't freeze."""
        self.is_processing = True
        self.label_info.configure(text=f"Processing {filename} (Shift {shift})... Please wait.", fg="red")

        # Snapshot the setting now so the worker uses the value that was
        # current when processing was requested.
        max_per_file = self.current_max_per_file

        def worker():
            res = process_single_pdf(filename, shift, max_per_file)
            self.manual_queue.put(res)

        Thread(target=worker, daemon=True).start()
        self.check_manual_queue(filename)

    def check_manual_queue(self, filename):
        """Poll for the manual-processing result and dispatch it.

        On failure the file is skipped and the per-file settings are reset,
        exactly as when the user advances with Enter.
        """
        try:
            result = self.manual_queue.get_nowait()
        except Empty:
            # Check again in 100ms
            self.root.after(100, lambda: self.check_manual_queue(filename))
            return

        # Clear the busy flag BEFORE dispatching: the skip path below may start
        # processing the next file, and that new busy state must not be
        # overwritten afterwards.
        self.is_processing = False

        if result:
            self.handle_processing_result(result, filename)
            return

        print(f"Failed to process {filename}, skipping.")
        self.index += 1
        self.current_shift = 0
        self.current_max_per_file = self.DEFAULT_MAX_PER_FILE
        self.load_current_image(use_prefetch=True)

    def handle_processing_result(self, result, filename):
        """Store the preview, save the output in the background, refresh the GUI.

        Saves run on non-daemon threads so the interpreter waits for them at
        exit (a daemon thread could be killed mid-write after the last file).
        Each save first waits for the previous one, so reprocessing the same
        PDF cannot interleave cleanup and writes of two saves.
        """
        preview, _, schema = result
        self.current_preview = preview

        previous_save = self.last_save_thread

        def save_in_order():
            if previous_save is not None:
                previous_save.join()
            save_results(result, filename)

        self.last_save_thread = Thread(target=save_in_order)
        self.last_save_thread.start()

        self.update_display(filename, schema)

    def update_display(self, filename, schema=None):
        """Show the current preview and the status/help line."""
        if self.current_preview is not None:
            tk_image = ImageTk.PhotoImage(self.current_preview)
            self.label_img.configure(image=tk_image)
            # Keep a reference on the widget so the image is not garbage-collected.
            self.label_img.image = tk_image

        schema_info = ""
        if schema:
            cols = str(schema['columns_per_file'])
            schema_info = f"\nFiles: {schema['number_of_files']} | Cols: {cols}"

        self.label_info.configure(
            text=f"[{self.index+1}/{len(self.files)}] {filename} | Shift: {self.current_shift}px"
                 f"{schema_info}\n"
                 f"Enter: Next | n: +50 | N: +100 | t: -50 | 1: use single column",
            fg="black"
        )

    def on_shift(self, amount):
        """Move the crop strip by ``amount`` pixels and reprocess the file."""
        if self.is_processing:
            return  # Ignore keys while processing
        self.current_shift += amount
        print(f"Applying shift: {self.current_shift}")
        self.trigger_processing(self.files[self.index], self.current_shift)

    def on_next(self, event):
        """Advance to the next file with default shift and page settings."""
        if self.is_processing:
            return
        self.index += 1
        self.current_shift = 0
        self.current_max_per_file = self.DEFAULT_MAX_PER_FILE  # Reset to default
        self.load_current_image(use_prefetch=True)
|
|
|
|
# --- Entry Point ---
|
|
if __name__ == "__main__":
    # Open the review window only when there is at least one PDF to review;
    # constructing ImageReviewer runs the GUI until the window closes.
    if files:
        app = ImageReviewer(files)
    else:
        print("No PDF files found.")
|