Fix missing pages

2026-02-25 20:20:41 +01:00 · 2026-02-25 20:20:41 +01:00 · 6215b2aad2
parent a682e21e8b
commit 6215b2aad2
1 changed file with 50 additions and 75 deletions
--- a/splitting_int.py
+++ b/splitting_int.py
@@ -6,17 +6,15 @@ import sys
 import json
 import shutil
 from pathlib import Path
-from collections import defaultdict  # Added for grouping
+from collections import defaultdict
 # input_pdf = "Une Interro/Split.pdf"
 def decode_json(pdf_file):
    file_path = Path(pdf_file)
    # Load JSON content from File.json
    with open(file_path.with_suffix(".json"), "r") as f:
        json_result = json.load(f)
    # Get number of pages from File.pdf
    nb_pages = len(PdfReader(file_path).pages)
    bb_list = json_result["list"]
@@ -26,15 +24,11 @@ def decode_json(pdf_file):
    def page_number(b):
        return ((b[1] + b[3]) // 2) // column_width
-    result = [] # contient la page, et, en millième, au dessus du label
+    result = []
                # (marge en plus), et au dessus du label (marge en moins)
    for d in bb_list:
        (b, label) = d["box_2d"], d["label"]
        # print(b)
        pn = page_number(b)
        # 38 carreaux dans une page
        carreau = 1000 // 38
        # result.append((label, pn, b[2] - 3 * carreau, b[2] + int(carreau)))
        result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau)))
    result.sort(key=lambda x: (x[1], x[2]))
    return (name, result)
@@ -45,18 +39,13 @@ def split_an_interro(base_dir,input_pdf, coords_list):
    output_dir = base_dir / input_pdf.stem
    generated_files = set()
    # Dictionary to collect parts for each label
    parts_by_label = defaultdict(list)
-    # Filter coords_list to remove consecutive duplicate labels.
+    # Filter consecutive duplicate labels
    # If a label appears at the end of a page and again at the start of the next,
    # we want to treat it as one continuous block, not two separate cuts.
    filtered_coords = []
    if coords_list:
        filtered_coords.append(coords_list[0])
        for item in coords_list[1:]:
            # item[0] is the label/title
            if item[0] != filtered_coords[-1][0]:
                filtered_coords.append(item)
    coords_list = filtered_coords
@@ -67,79 +56,67 @@ def split_an_interro(base_dir,input_pdf, coords_list):
        return (y / 1000) * page_height
    def save_cropped_page(doc, page_num, y0, y1, out_path):
-        """
+        """Saves a cropped portion of a page as a new PDF."""
-        Saves a cropped portion of a page as a new PDF,
-        correctly handling the original page's rotation.
-        """
        # print(f"Saving cropped_page with : {y0} and {y1}")
        # Get the source page object
        page = doc[page_num]
        # print("Debug : ", page_num, y0, y1, output_dir)
        # 1. Define the crop rectangle in the VISUAL (rotated) coordinate system.
        # The page.rect gives unrotated dimensions, so we apply the transformation
        # matrix to get the visual dimensions.
        rotated_rect = page.rect * page.transformation_matrix
        visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
        # 2. Transform this visual crop rectangle back into the UNROTATED system.
        # The 'clip' argument for show_pdf_page requires unrotated coordinates.
        # The derotation_matrix does this conversion for us.
        unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
        # Create a new temporary document for the output
        temp_doc = fitz.open()
        # Create a new page with the dimensions of our visual crop
        temp_page = temp_doc.new_page(
            width=visual_crop_rect.width,
            height=visual_crop_rect.height
        )
        # Display the cropped and de-rotated content on the new page
        temp_page.show_pdf_page(
-            temp_page.rect,      # Where to place the content on the new page (the whole page)
+            temp_page.rect,
-            doc,                 # Source document
+            doc,
            page_num,
-            rotate=-page.rotation,  # Cancel the original page's rotation
+            rotate=-page.rotation,
-            clip=unrotated_clip_rect  # The area to take from the source page
+            clip=unrotated_clip_rect
        )
        # Save the new one-page PDF and close the document
        temp_doc.save(out_path)
        temp_doc.close()
-
+    # Iterate through all labels
-    for idx, (title, page_nb, ymin, _) in enumerate(coords_list):
+    for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list):
        temp_parts = []
        y_start = scale_coord(ymin, doc[page_nb])
        # Determine the stopping point for this label
        if idx + 1 < len(coords_list):
-            _, next_page_nb, _, next_ymax = coords_list[idx + 1]
+            # Normal case: stop at the next label
-            if next_page_nb == page_nb:
+            _, end_page, _, y_end_raw = coords_list[idx + 1]
-                # Same page
+            end_y_target_raw = y_end_raw
                y_end = scale_coord(next_ymax, doc[page_nb])
                temp_path = f"_part_{idx}_0.pdf"
                save_cropped_page(doc, page_nb, y_start, y_end, temp_path)
                temp_parts.append(temp_path)
        else:
-                # Current page part
+            # FIX BUG 1: Last label extends to the very end of the document
-                temp_path1 = f"_part_{idx}_0.pdf"
+            end_page = doc.page_count - 1
-                save_cropped_page(doc, page_nb, y_start, doc[page_nb].rect.height, temp_path1)
+            end_y_target_raw = 1000  # 1000 represents full height
                temp_parts.append(temp_path1)
-                # Next page part
+        # FIX BUG 2: Iterate through EVERY page from start to end
-                y_end_next = scale_coord(next_ymax, doc[next_page_nb])
+        # This handles cases where start_page == end_page, start_page + 1 == end_page,
-                temp_path2 = f"_part_{idx}_1.pdf"
+        # AND start_page + N == end_page (gaps)
-                if y_end_next >= 10:
+        current_p = start_page
-                    save_cropped_page(doc, next_page_nb, 0, y_end_next, temp_path2)
+        while current_p <= end_page:
-                    temp_parts.append(temp_path2)
+
            # Determine Top Cut (y0)
            if current_p == start_page:
                y0 = scale_coord(y_start_raw, doc[current_p])
            else:
-            # Last segment to end of page
+                y0 = 0 # Start from top of page for intermediate/last pages
-            temp_path = f"_part_{idx}_0.pdf"
+
-            save_cropped_page(doc, page_nb, y_start, doc[page_nb].rect.height, temp_path)
+            # Determine Bottom Cut (y1)
            if current_p == end_page:
                y1 = scale_coord(end_y_target_raw, doc[current_p])
            else:
                y1 = doc[current_p].rect.height # Go to bottom of intermediate pages
            # Only save if the slice has height (avoid empty files)
            if y1 > y0 + 1:
                temp_path = f"_part_{idx}_{current_p}.pdf"
                save_cropped_page(doc, current_p, y0, y1, temp_path)
                temp_parts.append(temp_path)
-        # Collect parts for this label instead of writing immediately
+            current_p += 1
        parts_by_label[title].extend(temp_parts)
    output_dir.mkdir(parents=True, exist_ok=True)
@@ -148,6 +125,7 @@ def split_an_interro(base_dir,input_pdf, coords_list):
    for title, parts in parts_by_label.items():
        merger = PdfWriter()
        for part in parts:
            if os.path.exists(part):
                merger.append(part)
        filename = f"{title}.pdf"
@@ -155,19 +133,19 @@ def split_an_interro(base_dir,input_pdf, coords_list):
        merger.close()
        generated_files.add(filename)
-        # Cleanup temporary files for this label
+        # Cleanup
        for part in parts:
            if os.path.exists(part):
                os.remove(part)
    doc.close()
-    # --- Cleanup Logic ---
+
    # Move files not generated in this run to 'Missing' folder
    if output_dir.exists():
        missing_dir = output_dir / "Missing"
        for item in output_dir.iterdir():
            if item.is_file() and item.name not in generated_files:
-                print(f"ALERT: File '{item.name}' in '{input_pdf.stem}' was not generated. Moving to {missing_dir}")
+                print(f"ALERT: File '{item.name}' not generated. Moving to {missing_dir}")
                missing_dir.mkdir(exist_ok=True)
                item.rename(missing_dir / item.name)
@@ -180,12 +158,9 @@ if __name__ == "__main__":
    input_arg = Path(sys.argv[1])
    if input_arg.is_file():
        # If a single file is provided, process only that file.
        # base_dir is assumed to be the directory containing the file.
        base_dir = input_arg.parent
        pdf_files = [input_arg]
    elif input_arg.is_dir():
        # If a directory is provided, process all PDFs inside.
        base_dir = input_arg
        pdf_files = sorted(base_dir.glob("*.pdf"))
    else:
@@ -199,4 +174,4 @@ if __name__ == "__main__":
                print("Decoded name : ", name)
                split_an_interro(base_dir, pdf_path, coords)
            else:
-                print(f"Warning: No JSON found for {pdf_path.name} at {json_path}")
+                print(f"Warning: No JSON found for {pdf_path.name}")