import fitz  # PyMuPDF
from pypdf import PdfWriter
from pypdf import PdfReader
import os
import sys
import json
import shutil
import tempfile
from pathlib import Path
from collections import defaultdict

# Vertical margin, in the 0-1000 normalised coordinate space, that is
# subtracted from every box in decode_json and added back when computing
# where a section stops.
# NOTE(review): presumably one square ("carreau") of a 38-row grid sheet.
carreau = 1000 // 38


def decode_json(pdf_file):
    """Load the bounding boxes stored next to *pdf_file*.

    The JSON file has the same path as the PDF with a ``.json`` suffix and
    must contain a ``"name"`` entry and a ``"list"`` of objects carrying a
    ``"box_2d"`` (four numbers, used here as ``[y0, x0, y1, x1]`` in a
    0-1000 range) and a ``"label"``.

    The 0-1000 horizontal range is divided into one equal-width column per
    PDF page; the page of a box is the column containing its horizontal
    centre.

    Args:
        pdf_file: Path (or string) of the PDF file.

    Returns:
        A ``(name, boxes)`` tuple where ``boxes`` is a list of
        ``(label, page_index, y0, y1, x0, x1)`` tuples sorted by page and
        then by ``y0``. Both ``y0`` and ``y1`` have ``carreau`` subtracted.

    Raises:
        ValueError: If the PDF has no pages.
    """
    file_path = Path(pdf_file)
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(file_path.with_suffix(".json"), "r", encoding="utf-8") as f:
        json_result = json.load(f)

    nb_pages = len(PdfReader(file_path).pages)
    if nb_pages == 0:
        raise ValueError(f"{file_path} has no pages")

    bb_list = json_result["list"]
    name = json_result["name"]
    column_width = max(1, 1000 // nb_pages)
    last_page = nb_pages - 1

    def page_number(b):
        # Integer division leaves a remainder strip on the right
        # (e.g. 1000 // 3 == 333, so x == 999 would map to column 3):
        # clamp so that the index always designates an existing page.
        return min(int(((b[1] + b[3]) // 2) // column_width), last_page)

    result = []
    for d in bb_list:
        b, label = d["box_2d"], d["label"]
        pn = page_number(b)
        result.append((label, pn, b[0] - carreau, b[2] - carreau, b[1], b[3]))

    result.sort(key=lambda x: (x[1], x[2]))
    return (name, result)


def _parse_label(item):
    """Strip the '|' marker of a label and classify it.

    Returns ``(clean_label, c_type, pn, y0, y1, x0, x1)`` where ``c_type``
    is "L" for ``|name``, "R" for ``name|`` and "N" otherwise.
    """
    label, pn, y0, y1, x0, x1 = item
    if label.startswith("|"):
        c_type, clean_label = "L", label[1:]
    elif label.endswith("|"):
        c_type, clean_label = "R", label[:-1]
    else:
        c_type, clean_label = "N", label
    return (clean_label, c_type, pn, y0, y1, x0, x1)


def _scale_coord(y, page):
    """Scale y from the 0-1000 range to PDF points of *page*."""
    return (y / 1000) * page.rect.height


def _save_cropped_page(doc, page_num, x0, y0, x1, y1, out_path):
    """Save a cropped portion of page *page_num* of *doc* as a new PDF.

    ``x0, y0, x1, y1`` are expressed in PDF points; the x values are
    offset by the left edge of the page rectangle after it has been
    multiplied by the page transformation matrix.
    """
    page = doc[page_num]
    rotated_rect = page.rect * page.transformation_matrix
    visual_crop_rect = fitz.Rect(
        rotated_rect.x0 + x0, y0, rotated_rect.x0 + x1, y1
    )
    unrotated_clip_rect = visual_crop_rect * page.derotation_matrix

    temp_doc = fitz.open()
    try:
        temp_page = temp_doc.new_page(
            width=visual_crop_rect.width,
            height=visual_crop_rect.height,
        )
        temp_page.show_pdf_page(
            temp_page.rect,
            doc,
            page_num,
            rotate=-page.rotation,
            clip=unrotated_clip_rect,
        )
        temp_doc.save(out_path)
    finally:
        temp_doc.close()


def split_an_interro(base_dir, input_pdf, coords_list):
    """Split *input_pdf* into one PDF per label.

    Each label starts a section at its own box and runs until the next
    "stopping" label:

    * a ``|name`` (left) label stops at the next left or normal label and
      keeps the part of the page to the right of its box;
    * a ``name|`` (right) label stops at the next right or normal label
      and keeps the part of the page to the left of the horizontal centre
      of the left label closest to it in y;
    * a normal label stops at any following label and keeps the full
      page width.

    Consecutive entries with the same cleaned label are merged, the label
    ``"_"`` produces no output (but still stops the previous section), and
    all the pieces cropped for one label are concatenated into
    ``<base_dir>/<input_pdf stem>/<label>.pdf``. Files already present in
    that directory that were not regenerated are moved to a ``Missing``
    sub-directory.

    Args:
        base_dir: Path of the directory in which the output directory is
            created.
        input_pdf: Path of the PDF to split.
        coords_list: List of ``(label, page_index, y0, y1, x0, x1)``
            tuples as returned by ``decode_json``.
    """
    output_dir = base_dir / input_pdf.stem
    generated_files = set()
    parts_by_label = defaultdict(list)

    # 1. Parse labels to strip '|' and determine type:
    #    L (Left), R (Right), N (Normal).
    parsed_coords = [_parse_label(item) for item in coords_list]

    # 2. Filter consecutive duplicate labels based on the cleaned name.
    filtered_coords = []
    for item in parsed_coords:
        if not filtered_coords or item[0] != filtered_coords[-1][0]:
            filtered_coords.append(item)

    # Left labels are looked up among *all* parsed entries (before the
    # duplicate filtering); the list does not depend on the current label.
    left_labels = [it for it in parsed_coords if it[1] == "L"]

    doc = fitz.open(input_pdf)
    try:
        page_count = doc.page_count
        # Width of one page column in the 0-1000 horizontal range.
        col_w = 1000 / page_count if page_count else 1000.0

        # Intermediate crops live in a private temporary directory so that
        # they never pollute the working directory, cannot collide with
        # another run, and are removed even when an error occurs.
        with tempfile.TemporaryDirectory(prefix="split_interro_") as tmp_dir:
            for idx, entry in enumerate(filtered_coords):
                (clean_label, c_type, start_page,
                 y_start_raw, _y_end, x0_raw, _x1) = entry
                if clean_label == "_":
                    continue

                end_page = page_count - 1
                end_y_target_raw = 1000

                # Determine the stopping label.
                for next_item in filtered_coords[idx + 1:]:
                    n_type, n_pn, n_y_start = next_item[1:4]
                    # Normal labels stop at anything; L/R labels stop at
                    # a label of their own kind or at a normal label.
                    if c_type == "N" or n_type in (c_type, "N"):
                        end_page = n_pn
                        # One carreau was subtracted from the box when it
                        # was decoded: add it back, with a 25 % margin.
                        end_y_target_raw = min(
                            n_y_start + int(1.25 * carreau), 1000
                        )
                        break

                # Horizontal boundaries, as 0.0-1.0 fractions of the
                # local page width.
                fraction_x0 = 0.0
                fraction_x1 = 1.0
                if c_type == "L":  # |name
                    fraction_x0 = (x0_raw % col_w) / col_w
                    # NOTE(review): extra 40/1000 of vertical extent for
                    # left sections; rationale not visible here.
                    end_y_target_raw = min(1000, end_y_target_raw + 40)
                elif c_type == "R" and left_labels:  # name|
                    closest_left = min(
                        left_labels,
                        key=lambda it: abs(it[3] - y_start_raw),
                    )
                    center_x = (closest_left[5] + closest_left[6]) / 2.0
                    candidate = (center_x % col_w) / col_w
                    # Fall back to the full width for an empty span.
                    if candidate > fraction_x0:
                        fraction_x1 = candidate

                temp_parts = []
                for current_p in range(start_page, end_page + 1):
                    page = doc[current_p]
                    if current_p == start_page:
                        # The start was shifted up by one carreau and may
                        # therefore be negative: never start above the
                        # top edge of the page.
                        y0 = _scale_coord(max(y_start_raw, 0), page)
                    else:
                        y0 = 0
                    if current_p == end_page:
                        y1 = _scale_coord(end_y_target_raw, page)
                    else:
                        y1 = page.rect.height

                    if y1 > y0 + 1:
                        # Convert fractions to absolute PDF points.
                        x0_pdf = fraction_x0 * page.rect.width
                        x1_pdf = fraction_x1 * page.rect.width
                        temp_path = os.path.join(
                            tmp_dir, f"_part_{idx}_{current_p}.pdf"
                        )
                        _save_cropped_page(
                            doc, current_p, x0_pdf, y0, x1_pdf, y1, temp_path
                        )
                        temp_parts.append(temp_path)

                parts_by_label[clean_label].extend(temp_parts)

            output_dir.mkdir(parents=True, exist_ok=True)

            # Concatenate the aggregated parts of each label.
            for title, parts in parts_by_label.items():
                filename = f"{title}.pdf"
                merger = PdfWriter()
                try:
                    for part in parts:
                        if os.path.exists(part):
                            merger.append(part)
                    merger.write(output_dir / filename)
                finally:
                    merger.close()
                generated_files.add(filename)
    finally:
        doc.close()

    # Move files not generated in this run to the 'Missing' folder.
    if output_dir.exists():
        missing_dir = output_dir / "Missing"
        # Materialise the listing first: the directory is modified
        # (sub-directory created, files moved) while it is processed.
        for item in list(output_dir.iterdir()):
            if item.is_file() and item.name not in generated_files:
                print(f"ALERT: File '{item.name}' not generated. Moving to {missing_dir}")
                missing_dir.mkdir(exist_ok=True)
                # replace() overwrites an existing target on every platform.
                item.replace(missing_dir / item.name)


def main():
    """Split the PDF, or every PDF of the directory, given on the command line.

    A PDF is processed only when a JSON file with the same stem exists
    next to it; otherwise a warning is printed.
    """
    if len(sys.argv) < 2:
        print("Usage: python script.py <pdf_file_or_directory>")
        sys.exit(1)

    input_arg = Path(sys.argv[1])
    if input_arg.is_file():
        base_dir = input_arg.parent
        pdf_files = [input_arg]
    elif input_arg.is_dir():
        base_dir = input_arg
        pdf_files = sorted(base_dir.glob("*.pdf"))
    else:
        print(f"Error: {input_arg} is not a valid file or directory.")
        sys.exit(1)

    for pdf_path in pdf_files:
        json_path = pdf_path.with_suffix(".json")
        if json_path.exists():
            (name, coords) = decode_json(pdf_path)
            print("Decoded name : ", name)
            split_an_interro(base_dir, pdf_path, coords)
        else:
            print(f"Warning: No JSON found for {pdf_path.name}")


if __name__ == "__main__":
    main()