"""Split a PDF into one PDF per labelled section, driven by a JSON sidecar.

For every ``<name>.pdf`` there must be a ``<name>.json`` next to it that lists
labelled boxes.  Each label marks where a section starts; a section runs until
the next label (or the end of the document for the last one) and is written to
``<base_dir>/<pdf stem>/<label>.pdf``.
"""

import json
import os
import shutil
import sys
import tempfile
from collections import defaultdict
from pathlib import Path

import fitz  # PyMuPDF
from pypdf import PdfReader, PdfWriter

# Box coordinates in the JSON sidecar are expressed on a 0..1000 scale.
_COORD_SCALE = 1000

# Amount subtracted from both vertical coordinates of every box (1/38 of the
# scale).  The original code calls it "carreau"; presumably one square of grid
# paper, so that cuts land slightly above the detected label -- TODO confirm.
_CARREAU = _COORD_SCALE // 38


def decode_json(pdf_file):
    """Read the JSON sidecar of *pdf_file* and convert its boxes to cut marks.

    The sidecar is the same path with a ``.json`` suffix.  It must contain the
    keys ``"name"`` and ``"list"``; every entry of ``"list"`` must contain
    ``"box_2d"`` (four numbers) and ``"label"``.

    For each box, elements 1 and 3 are averaged and divided by
    ``1000 // page_count`` to obtain a zero-based page index, and elements 0
    and 2 (each reduced by ``1000 // 38``) become the vertical start and end
    coordinates.
    NOTE(review): this looks like ``[ymin, xmin, ymax, xmax]`` on a 0..1000
    scale over an image showing all pages side by side -- confirm with the
    producer of the JSON.

    Args:
        pdf_file: Path (or string) of the PDF whose sidecar is read.

    Returns:
        ``(name, entries)`` where *entries* is a list of
        ``(label, page_index, y_start, y_end)`` tuples sorted by page index,
        then by ``y_start``.

    Raises:
        ValueError: If the PDF contains no pages.
    """
    file_path = Path(pdf_file)
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(file_path.with_suffix(".json"), "r", encoding="utf-8") as f:
        json_result = json.load(f)

    nb_pages = len(PdfReader(file_path).pages)
    if nb_pages < 1:
        raise ValueError(f"{file_path} contains no pages")

    bb_list = json_result["list"]
    name = json_result["name"]

    # max(1, ...) keeps the division below valid for documents with more
    # pages than the coordinate scale has units.
    column_width = max(1, _COORD_SCALE // nb_pages)
    last_page = nb_pages - 1

    result = []
    for entry in bb_list:
        box, label = entry["box_2d"], entry["label"]
        center = (box[1] + box[3]) // 2
        # The column width is truncated, so a box centred near the right edge
        # would otherwise land on a page index past the end of the document.
        page_index = min(center // column_width, last_page)
        result.append((label, page_index, box[0] - _CARREAU, box[2] - _CARREAU))

    result.sort(key=lambda item: (item[1], item[2]))
    return (name, result)


def _drop_consecutive_duplicates(coords_list):
    """Return *coords_list* without entries repeating the previous kept label."""
    kept = []
    for item in coords_list:
        if not kept or item[0] != kept[-1][0]:
            kept.append(item)
    return kept


def _scale_coord(y, page_height):
    """Scale *y* from the 0..1000 range to PDF points for a page of *page_height*."""
    return (y / _COORD_SCALE) * page_height


def _safe_filename(title):
    """Return *title* as text with path separators replaced by underscores."""
    return str(title).replace("/", "_").replace("\\", "_")


def _save_cropped_page(doc, page_num, y0, y1, out_path):
    """Save the horizontal band ``y0..y1`` of page *page_num* as a one-page PDF.

    *y0* and *y1* are in points, measured on ``page.rect``.
    """
    page = doc[page_num]
    # Only x0/x1 of this rectangle are used; the vertical extent is y0..y1.
    # NOTE(review): page.rect is combined with transformation_matrix here and
    # derotation_matrix below exactly as in the original code -- verify the
    # result on rotated pages.
    rotated_rect = page.rect * page.transformation_matrix
    visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
    unrotated_clip_rect = visual_crop_rect * page.derotation_matrix

    temp_doc = fitz.open()
    try:
        temp_page = temp_doc.new_page(
            width=visual_crop_rect.width,
            height=visual_crop_rect.height,
        )
        temp_page.show_pdf_page(
            temp_page.rect,
            doc,
            page_num,
            rotate=-page.rotation,
            clip=unrotated_clip_rect,
        )
        temp_doc.save(out_path)
    finally:
        # Release the document even if rendering or saving failed.
        temp_doc.close()


def _extract_parts(input_pdf, coords_list, tmp_dir):
    """Crop every section of *input_pdf* into part files inside *tmp_dir*.

    Returns a dict mapping each label to the ordered list of part-file paths
    (strings).  A label with nothing to crop maps to an empty list.
    """
    parts_by_label = defaultdict(list)
    doc = fitz.open(input_pdf)
    try:
        for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list):
            parts = parts_by_label[title]  # registers the label even if empty

            if idx + 1 < len(coords_list):
                # Normal case: stop at the next label.
                _, end_page, _, y_end_raw = coords_list[idx + 1]
            else:
                # Last label: extend to the very end of the document.
                end_page = doc.page_count - 1
                y_end_raw = _COORD_SCALE

            # Visit every page from start to end, including pages in between.
            for page_num in range(start_page, end_page + 1):
                page_height = doc[page_num].rect.height

                if page_num == start_page:
                    # The upward shift can push the cut above the page top.
                    y0 = max(0, _scale_coord(y_start_raw, page_height))
                else:
                    y0 = 0  # intermediate/last pages start at the top

                if page_num == end_page:
                    y1 = min(page_height, _scale_coord(y_end_raw, page_height))
                else:
                    y1 = page_height  # intermediate pages run to the bottom

                # Only save slices that have some height (avoid empty files).
                if y1 > y0 + 1:
                    part_path = str(tmp_dir / f"_part_{idx}_{page_num}.pdf")
                    _save_cropped_page(doc, page_num, y0, y1, part_path)
                    parts.append(part_path)
    finally:
        doc.close()
    return parts_by_label


def _move_missing_files(output_dir, generated_files):
    """Move files of *output_dir* not named in *generated_files* to ``Missing``."""
    if not output_dir.exists():
        return
    missing_dir = output_dir / "Missing"
    # Take a snapshot of the listing: entries are moved while we walk it.
    for item in sorted(output_dir.iterdir()):
        if item.is_file() and item.name not in generated_files:
            print(f"ALERT: File '{item.name}' not generated. Moving to {missing_dir}")
            missing_dir.mkdir(exist_ok=True)
            # replace() overwrites an older copy on every platform.
            item.replace(missing_dir / item.name)


def split_an_interro(base_dir, input_pdf, coords_list):
    """Split *input_pdf* into one PDF per label inside ``base_dir / input_pdf.stem``.

    Args:
        base_dir: ``Path`` of the directory that receives the output folder.
        input_pdf: ``Path`` of the PDF to split.
        coords_list: Sequence of ``(label, page_index, y_start, y_end)`` tuples
            with vertical coordinates on a 0..1000 scale, in reading order
            (the shape returned by ``decode_json``).

    Consecutive entries with the same label are collapsed to the first one.
    Each section starts at its own ``y_start`` and ends at the ``y_end`` of the
    next entry; the last section runs to the end of the document.  Sections
    sharing a label are concatenated into a single file.  Files already in the
    output folder that were not produced by this call are moved to a
    ``Missing`` sub-folder.
    """
    output_dir = base_dir / input_pdf.stem
    coords_list = _drop_consecutive_duplicates(coords_list)
    generated_files = set()

    # Part files live in a private temporary directory so that nothing is left
    # behind in the working directory, even when an error interrupts the run.
    with tempfile.TemporaryDirectory(prefix="split_interro_") as tmp_name:
        parts_by_label = _extract_parts(input_pdf, coords_list, Path(tmp_name))

        output_dir.mkdir(parents=True, exist_ok=True)

        for title, parts in parts_by_label.items():
            if not parts:
                # Writing a zero-page PDF would hide the fact that the section
                # is missing.
                print(f"Warning: no content extracted for label '{title}'; no file written.")
                continue

            filename = f"{_safe_filename(title)}.pdf"
            merger = PdfWriter()
            try:
                for part in parts:
                    merger.append(part)
                merger.write(output_dir / filename)
            finally:
                merger.close()
            generated_files.add(filename)

    _move_missing_files(output_dir, generated_files)


def main(argv=None):
    """Command-line entry point: split one PDF, or every PDF of a directory.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        print(f"Usage: python {Path(sys.argv[0]).name} <pdf_file_or_directory>")
        sys.exit(1)

    input_arg = Path(args[0])
    if input_arg.is_file():
        base_dir = input_arg.parent
        pdf_files = [input_arg]
    elif input_arg.is_dir():
        base_dir = input_arg
        pdf_files = sorted(base_dir.glob("*.pdf"))
    else:
        print(f"Error: {input_arg} is not a valid file or directory.")
        sys.exit(1)

    for pdf_path in pdf_files:
        json_path = pdf_path.with_suffix(".json")
        if json_path.exists():
            (name, coords) = decode_json(pdf_path)
            print("Decoded name : ", name)
            split_an_interro(base_dir, pdf_path, coords)
        else:
            print(f"Warning: No JSON found for {pdf_path.name}")


if __name__ == "__main__":
    main()