# Copies/splitting_int.py

import json
import os
import shutil
import sys
import tempfile
from collections import defaultdict
from pathlib import Path

import fitz  # PyMuPDF
from pypdf import PdfReader
from pypdf import PdfWriter

def decode_json(pdf_file):
    """Load the bounding boxes stored next to *pdf_file* and normalise them.

    The boxes are read from the file that has the same path as *pdf_file* but
    a ``.json`` suffix. That file must contain a ``"name"`` entry and a
    ``"list"`` of items, each holding a ``"box_2d"`` (four numbers) and a
    ``"label"``.

    Box coordinates are expressed on a 0-1000 scale. The horizontal axis is
    divided into one equal-width column per PDF page, and the column that
    contains the horizontal centre of a box gives its page index.

    Args:
        pdf_file: Path (``str`` or ``Path``) of the PDF document.

    Returns:
        A ``(name, result)`` tuple. ``result`` is a list of
        ``(label, page, b0, b2, b1, b3)`` tuples sorted by ``(page, b0)``,
        where ``b0`` and ``b2`` are the first and third box values shifted by
        one grid square and ``b1``/``b3`` are the second and fourth values
        unchanged.

    Raises:
        ZeroDivisionError: If the PDF has no pages (or more than 1000).
        KeyError: If an expected key is missing from the JSON.
    """
    file_path = Path(pdf_file)
    # JSON is UTF-8 by specification; do not depend on the platform default,
    # otherwise non-ASCII labels get mangled on some systems.
    with open(file_path.with_suffix(".json"), "r", encoding="utf-8") as f:
        json_result = json.load(f)

    nb_pages = len(PdfReader(file_path).pages)

    bb_list = json_result["list"]
    name = json_result["name"]
    column_width = 1000 // nb_pages
    last_page = nb_pages - 1
    # Vertical shift applied to every box.
    # NOTE(review): presumably the height of one square on 38-row grid paper
    # -- confirm with the author.
    carreau = 1000 // 38

    def page_number(b):
        """Return the 0-based page index for box *b*, clamped to valid pages."""
        centre = (b[1] + b[3]) // 2
        # column_width is truncated, so nb_pages * column_width can be < 1000
        # (e.g. 3 pages -> 999). A centre in the leftover strip would yield an
        # index equal to nb_pages; clamp it onto the last page.
        return max(0, min(centre // column_width, last_page))

    result = []
    for d in bb_list:
        b, label = d["box_2d"], d["label"]
        result.append(
            (label, page_number(b), b[0] - carreau, b[2] - carreau, b[1], b[3])
        )
    result.sort(key=lambda x: (x[1], x[2]))
    return (name, result)


def _parse_label(label):
    """Return ``(clean_label, c_type)`` for a raw label.

    A leading ``|`` gives type ``"L"`` (left), a trailing ``|`` gives type
    ``"R"`` (right) and anything else is ``"N"`` (normal). The marker itself
    is stripped from the returned label.
    """
    if label.startswith("|"):
        return label[1:], "L"
    if label.endswith("|"):
        return label[:-1], "R"
    return label, "N"


def _drop_consecutive_duplicates(parsed_coords):
    """Keep only the first entry of each run of identical clean labels."""
    filtered = []
    for item in parsed_coords:
        if not filtered or item[0] != filtered[-1][0]:
            filtered.append(item)
    return filtered


def _find_stop(coords, idx, c_type, last_page):
    """Return ``(end_page, end_y_raw)`` where the region of entry *idx* ends.

    A normal entry stops at the very next entry. An ``"L"`` or ``"R"`` entry
    only stops at the next entry of its own type or of type ``"N"``. When no
    later entry qualifies, the region runs to the bottom (1000) of
    *last_page*.
    """
    for _, n_type, n_pn, n_y_start, _, _, _ in coords[idx + 1:]:
        if c_type == "N" or n_type in (c_type, "N"):
            return n_pn, n_y_start
    return last_page, 1000


def _horizontal_fractions(c_type, x0_raw, y_start_raw, left_labels, col_w):
    """Return ``(fraction_x0, fraction_x1)`` as fractions of the page width.

    * ``"L"``: from the label's own left edge to the right page border.
    * ``"R"``: from the left page border to the horizontal centre of the
      ``"L"`` label whose start is closest in y (full width when there is no
      ``"L"`` label or the computed fraction is not positive).
    * ``"N"``: full page width.

    *col_w* is the width of one page column on the 0-1000 scale; raw x values
    are reduced modulo *col_w* to make them local to their page.
    """
    if c_type == "L":
        return (x0_raw % col_w) / col_w, 1.0
    if c_type == "R" and left_labels:
        # NOTE(review): the closest "L" label is searched across all pages
        # using the raw y value only -- confirm this is intended.
        closest_left = min(left_labels, key=lambda it: abs(it[3] - y_start_raw))
        centre_x = (closest_left[5] + closest_left[6]) / 2.0
        fraction_x1 = (centre_x % col_w) / col_w
        if fraction_x1 <= 0.0:
            fraction_x1 = 1.0  # Fallback: degenerate width, use the full page
        return 0.0, fraction_x1
    return 0.0, 1.0


def _save_cropped_page(doc, page_num, x0, y0, x1, y1, out_path):
    """Save the rectangle ``(x0, y0, x1, y1)`` of a page as a one-page PDF.

    Coordinates are in PDF points. The new page has the size of the crop
    rectangle and shows the clipped source page with its rotation undone.
    """
    page = doc[page_num]
    rotated_rect = page.rect * page.transformation_matrix
    # NOTE(review): only the x values are offset by rotated_rect.x0; y values
    # are used as given -- kept exactly as in the original implementation.
    visual_crop_rect = fitz.Rect(rotated_rect.x0 + x0, y0, rotated_rect.x0 + x1, y1)
    unrotated_clip_rect = visual_crop_rect * page.derotation_matrix

    temp_doc = fitz.open()
    try:
        temp_page = temp_doc.new_page(
            width=visual_crop_rect.width,
            height=visual_crop_rect.height,
        )
        temp_page.show_pdf_page(
            temp_page.rect,
            doc,
            page_num,
            rotate=-page.rotation,
            clip=unrotated_clip_rect,
        )
        temp_doc.save(out_path)
    finally:
        temp_doc.close()


def split_an_interro(base_dir, input_pdf, coords_list):
    """Split *input_pdf* into one PDF per label.

    Every entry of *coords_list* marks where a labelled region starts. The
    region extends (possibly over several pages) until the next label that
    stops it, is cropped page by page, and all crops sharing the same clean
    label are merged into ``<base_dir>/<input_pdf stem>/<label>.pdf``. Entries
    whose clean label is ``"_"`` produce no output but still act as stop
    markers for other entries.

    Files already present in the output directory that were not produced by
    this run are moved to its ``Missing`` sub-directory and reported on
    stdout.

    Args:
        base_dir: ``Path`` of the directory receiving the output folder.
        input_pdf: ``Path`` of the PDF to split.
        coords_list: Iterable of ``(label, page, y0, y1, x0, x1)`` tuples on a
            0-1000 scale, already ordered by page and y0.
    """
    output_dir = base_dir / input_pdf.stem
    generated_files = set()
    parts_by_label = defaultdict(list)

    doc = fitz.open(input_pdf)
    try:
        # 1. Parse labels to strip '|' and determine the type (L, R or N).
        parsed_coords = []
        for label, pn, y0, y1, x0, x1 in coords_list:
            clean_label, c_type = _parse_label(label)
            parsed_coords.append((clean_label, c_type, pn, y0, y1, x0, x1))

        # 2. Filter consecutive duplicate labels based on the cleaned name.
        coords = _drop_consecutive_duplicates(parsed_coords)

        # Loop-invariant: candidates for the right border of "R" regions
        # (taken from the unfiltered list, as before).
        left_labels = [it for it in parsed_coords if it[1] == "L"]

        # Intermediate crops live in a private temporary directory so that
        # they never collide with other runs and are removed even on error.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for idx, entry in enumerate(coords):
                clean_label, c_type, start_page, y_start_raw, _, x0_raw, _ = entry
                if clean_label == "_":
                    continue

                # Registers the label even if no crop is produced for it.
                parts = parts_by_label[clean_label]

                end_page, end_y_target_raw = _find_stop(
                    coords, idx, c_type, doc.page_count - 1
                )
                if c_type == "L":
                    # "L" regions extend a little past the stopping label.
                    end_y_target_raw = min(1000, end_y_target_raw + 40)

                col_w = 1000 / doc.page_count
                fraction_x0, fraction_x1 = _horizontal_fractions(
                    c_type, x0_raw, y_start_raw, left_labels, col_w
                )

                for current_p in range(start_page, end_page + 1):
                    page = doc[current_p]
                    page_height = page.rect.height

                    # Scale y from the 0-1000 range to PDF points; pages in
                    # the middle of a region are taken in full.
                    y0 = (y_start_raw / 1000) * page_height if current_p == start_page else 0
                    y1 = (end_y_target_raw / 1000) * page_height if current_p == end_page else page_height

                    # Skip crops that are (almost) empty.
                    if y1 > y0 + 1:
                        x0_pdf = fraction_x0 * page.rect.width
                        x1_pdf = fraction_x1 * page.rect.width

                        temp_path = os.path.join(tmp_dir, f"_part_{idx}_{current_p}.pdf")
                        _save_cropped_page(doc, current_p, x0_pdf, y0, x1_pdf, y1, temp_path)
                        parts.append(temp_path)

            output_dir.mkdir(parents=True, exist_ok=True)

            # Merge the crops of each label while the temporary files exist.
            # NOTE(review): the label is used verbatim as a file name; a label
            # containing a path separator would escape output_dir.
            for title, parts in parts_by_label.items():
                filename = f"{title}.pdf"
                merger = PdfWriter()
                try:
                    for part in parts:
                        merger.append(part)
                    merger.write(output_dir / filename)
                finally:
                    merger.close()
                generated_files.add(filename)
    finally:
        doc.close()

    # Move files not generated in this run to the 'Missing' folder.
    missing_dir = output_dir / "Missing"
    for item in output_dir.iterdir():
        if item.is_file() and item.name not in generated_files:
            print(f"ALERT: File '{item.name}' not generated. Moving to {missing_dir}")
            missing_dir.mkdir(exist_ok=True)
            item.rename(missing_dir / item.name)


if __name__ == "__main__":
    # Command-line entry point: the single argument is either one PDF file or
    # a directory whose "*.pdf" files are all processed (in sorted order).
    if len(sys.argv) < 2:
        print("Usage: python script.py <directory or pdf_file>")
        sys.exit(1)

    target = Path(sys.argv[1])
    is_single_pdf = target.is_file()

    if not is_single_pdf and not target.is_dir():
        print(f"Error: {target} is not a valid file or directory.")
        sys.exit(1)

    # Output folders are created next to the PDF(s) being processed.
    if is_single_pdf:
        base_dir, pdf_files = target.parent, [target]
    else:
        base_dir = target
        pdf_files = sorted(base_dir.glob("*.pdf"))

    for pdf_path in pdf_files:
        # A PDF is only split when its sidecar JSON (same name, ".json"
        # suffix) is present; otherwise it is reported and skipped.
        if not pdf_path.with_suffix(".json").exists():
            print(f"Warning: No JSON found for {pdf_path.name}")
            continue

        name, coords = decode_json(pdf_path)
        print("Decoded name : ", name)
        split_an_interro(base_dir, pdf_path, coords)