Copies/splitting_int.py

203 lines
7.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import fitz # PyMuPDF
from pypdf import PdfWriter
from pypdf import PdfReader
import os
import sys
import json
import shutil
from pathlib import Path
from collections import defaultdict # Added for grouping
# input_pdf = "Une Interro/Split.pdf"
def decode_json(pdf_file):
    """Read the label boxes stored next to *pdf_file* and assign them to pages.

    The sidecar file has the same path as *pdf_file* with a ``.json`` suffix.
    It must contain a ``"name"`` entry and a ``"list"`` of objects, each with
    a ``"box_2d"`` sequence and a ``"label"``.

    NOTE(review): ``box_2d`` is presumably ``[ymin, xmin, ymax, xmax]`` on a
    0-1000 scale, with all pages laid out side by side in equal-width columns
    -- confirm against whatever produces the JSON.

    Args:
        pdf_file: Path (str or ``Path``) of the PDF whose sidecar JSON is read.

    Returns:
        ``(name, result)`` where ``result`` is a list of
        ``(label, page_index, b[0] - margin, b[2] - margin)`` tuples sorted by
        page index and then by the third field. ``margin`` is one grid square
        (``1000 // 38``).

    Raises:
        ZeroDivisionError: If the PDF has no pages.
        KeyError: If an expected key is missing from the JSON.
    """
    file_path = Path(pdf_file)

    # JSON is UTF-8 by specification; relying on the locale's default
    # encoding corrupts accented labels on non-UTF-8 platforms.
    with open(file_path.with_suffix(".json"), "r", encoding="utf-8") as f:
        json_result = json.load(f)

    # The page count comes from the PDF itself.
    nb_pages = len(PdfReader(file_path).pages)
    bb_list = json_result["list"]
    name = json_result["name"]

    # Width of one page column on the 0-1000 scale; kept >= 1 so that a very
    # long document cannot cause a division by zero below.
    column_width = max(1, 1000 // nb_pages)

    # A page is treated as 38 grid squares tall; one square is the margin.
    carreau = 1000 // 38

    def page_number(b):
        # Integer division leaves a remainder column at the far right
        # (e.g. 999 // 333 == 3 for a 3-page file), so clamp the index to the
        # last real page instead of returning an out-of-range page.
        return min(((b[1] + b[3]) // 2) // column_width, nb_pages - 1)

    # Each entry holds the label, its page, and b[0] / b[2] both shifted up
    # by one grid square (values stay on the 0-1000 scale).
    result = [
        (d["label"], page_number(d["box_2d"]),
         d["box_2d"][0] - carreau, d["box_2d"][2] - carreau)
        for d in bb_list
    ]
    result.sort(key=lambda x: (x[1], x[2]))
    return (name, result)
def _scale_coord(y, page):
    """Scale *y* from the 0-1000 range to PDF points using *page*'s height."""
    return (y / 1000) * page.rect.height


def _save_cropped_page(doc, page_num, y0, y1, out_path):
    """Save the band between *y0* and *y1* of one page as a one-page PDF.

    Args:
        doc: Open source document (``fitz`` document).
        page_num: Index of the page to crop.
        y0: Top of the band, in points.
        y1: Bottom of the band, in points.
        out_path: Destination path of the new one-page PDF.
    """
    page = doc[page_num]

    # Full-width band between y0 and y1. The horizontal extent is taken from
    # the page rectangle mapped through the page's transformation matrix.
    rotated_rect = page.rect * page.transformation_matrix
    visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)

    # show_pdf_page() is given the band mapped through the derotation matrix
    # and a rotation that cancels the page's own rotation.
    unrotated_clip_rect = visual_crop_rect * page.derotation_matrix

    temp_doc = fitz.open()
    try:
        # The new page has exactly the size of the band.
        temp_page = temp_doc.new_page(
            width=visual_crop_rect.width,
            height=visual_crop_rect.height,
        )
        temp_page.show_pdf_page(
            temp_page.rect,            # fill the whole new page
            doc,                       # source document
            page_num,
            rotate=-page.rotation,     # cancel the source page's rotation
            clip=unrotated_clip_rect,  # area taken from the source page
        )
        temp_doc.save(out_path)
    finally:
        # Always release the temporary document, even if saving failed.
        temp_doc.close()


def split_an_interro(base_dir, input_pdf, coords_list):
    """Cut *input_pdf* into one PDF per label and write them to a folder.

    Each entry of *coords_list* starts a segment that runs until the next
    entry with a different label. All segments sharing a label are merged,
    in order, into ``<base_dir>/<input_pdf stem>/<label>.pdf``. Files already
    in that folder that this run did not produce are moved to a ``Missing``
    sub-folder.

    Args:
        base_dir: ``Path`` of the directory that receives the output folder.
        input_pdf: ``Path`` of the PDF to split.
        coords_list: Sequence of ``(label, page_index, y_start, y_end)``
            tuples with the y values on a 0-1000 scale. A segment starts at
            its own ``y_start`` and ends at the next entry's ``y_end``.
            Expected to be ordered by page, then by position.
    """
    import tempfile  # local import: only this function needs it

    output_dir = base_dir / input_pdf.stem
    generated_files = set()
    parts_by_label = defaultdict(list)

    # Collapse consecutive entries with the same label. A label seen at the
    # end of one page and again at the top of the next is one continuous
    # block, not two separate cuts.
    filtered_coords = []
    for item in coords_list:
        if not filtered_coords or item[0] != filtered_coords[-1][0]:
            filtered_coords.append(item)
    coords_list = filtered_coords

    # Intermediate one-page PDFs live in a private temporary directory so
    # they neither pollute the working directory nor survive an error.
    with tempfile.TemporaryDirectory(prefix="split_interro_") as tmp_name:
        tmp_dir = Path(tmp_name)

        doc = fitz.open(input_pdf)
        try:
            for idx, (title, page_nb, ymin, _) in enumerate(coords_list):
                page = doc[page_nb]
                y_start = _scale_coord(ymin, page)

                # Bands (page index, top, bottom) making up this segment.
                bands = []
                if idx + 1 < len(coords_list):
                    _, next_page_nb, _, next_ymax = coords_list[idx + 1]
                    if next_page_nb == page_nb:
                        # The segment ends on the page where it starts.
                        bands.append(
                            (page_nb, y_start, _scale_coord(next_ymax, page))
                        )
                    else:
                        # Rest of the current page ...
                        bands.append((page_nb, y_start, page.rect.height))
                        # ... every page strictly in between, whole, so that
                        # content on label-free pages is not lost ...
                        for mid in range(page_nb + 1, next_page_nb):
                            bands.append((mid, 0, doc[mid].rect.height))
                        # ... and the top of the page holding the next label,
                        # unless that band is under 10 points tall.
                        y_end_next = _scale_coord(next_ymax, doc[next_page_nb])
                        if y_end_next >= 10:
                            bands.append((next_page_nb, 0, y_end_next))
                else:
                    # The last segment runs to the end of its page.
                    bands.append((page_nb, y_start, page.rect.height))

                for part_no, (band_page, top, bottom) in enumerate(bands):
                    part_path = str(tmp_dir / f"_part_{idx}_{part_no}.pdf")
                    _save_cropped_page(doc, band_page, top, bottom, part_path)
                    # Written per label later, once all parts are collected.
                    parts_by_label[title].append(part_path)
        finally:
            doc.close()

        output_dir.mkdir(parents=True, exist_ok=True)

        # Merge the collected parts of each label into its final PDF.
        for title, parts in parts_by_label.items():
            filename = f"{title}.pdf"
            merger = PdfWriter()
            try:
                for part in parts:
                    merger.append(part)
                merger.write(output_dir / filename)
            finally:
                merger.close()
            generated_files.add(filename)

    # --- Cleanup logic ---
    # Files in the output folder that this run did not generate are moved to
    # the 'Missing' sub-folder.
    if output_dir.exists():
        missing_dir = output_dir / "Missing"
        for item in output_dir.iterdir():
            if item.is_file() and item.name not in generated_files:
                print(f"ALERT: File '{item.name}' in '{input_pdf.stem}' was not generated. Moving to {missing_dir}")
                missing_dir.mkdir(exist_ok=True)
                item.rename(missing_dir / item.name)
if __name__ == "__main__":
    # Command-line entry point. The single argument is either one PDF or a
    # directory; every PDF with a sibling .json file is decoded and split.
    if len(sys.argv) < 2:
        # sys.exit(str) prints the message to stderr and exits with status 1.
        sys.exit(f"Usage: python {Path(sys.argv[0]).name} <directory or pdf_file>")

    input_arg = Path(sys.argv[1])

    if input_arg.is_file():
        # Single file: process only that file. Its parent directory is used
        # as the base directory for the output.
        base_dir = input_arg.parent
        pdf_files = [input_arg]
    elif input_arg.is_dir():
        # Directory: process every PDF directly inside it, in sorted order.
        base_dir = input_arg
        pdf_files = sorted(base_dir.glob("*.pdf"))
    else:
        sys.exit(f"Error: {input_arg} is not a valid file or directory.")

    for pdf_path in pdf_files:
        json_path = pdf_path.with_suffix(".json")
        if not json_path.exists():
            # A PDF without a JSON description is skipped, not fatal.
            print(f"Warning: No JSON found for {pdf_path.name} at {json_path}")
            continue
        (name, coords) = decode_json(pdf_path)
        print("Decoded name : ", name)
        split_an_interro(base_dir, pdf_path, coords)