Copies/splitting_int.py

217 lines
7.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import fitz # PyMuPDF
from pypdf import PdfWriter
from pypdf import PdfReader
import os
import sys
import json
import shutil
from pathlib import Path
from collections import defaultdict
# Size of one "carreau" (French for a square of the ruled grid) expressed in
# the 0-1000 coordinate space used by the bounding boxes.
# NOTE(review): presumably the sheet is about 38 squares tall - confirm.
carreau = 1000 // 38
def decode_json(pdf_file):
    """Read the JSON sidecar of ``pdf_file`` and return its labelled boxes.

    The sidecar is the file with the same path as the PDF but a ``.json``
    suffix. It must contain a ``"name"`` entry and a ``"list"`` of objects,
    each with a ``"box_2d"`` and a ``"label"`` key.

    Each box is used as ``[y0, x0, y1, x1]`` on a 0-1000 scale. The page a box
    belongs to is derived from its horizontal centre, splitting the 0-1000
    width into ``nb_pages`` equal columns (presumably the pages were laid out
    side by side when the boxes were produced - TODO confirm).

    Args:
        pdf_file: Path (or string) of the PDF whose sidecar is read.

    Returns:
        ``(name, boxes)`` where ``boxes`` is a list of
        ``(label, page_index, y0, y1, x0, x1)`` tuples sorted by page index
        and then by ``y0``. ``y0`` and ``y1`` are shifted up by one
        ``carreau``; ``page_index`` never exceeds the last page of the PDF.

    Raises:
        ZeroDivisionError: if the PDF has no pages.
        KeyError: if an expected key is missing from the sidecar.
    """
    file_path = Path(pdf_file)
    # JSON text is UTF-8 by specification; do not depend on the locale default
    # encoding, since labels may contain non-ASCII characters.
    with open(file_path.with_suffix(".json"), "r", encoding="utf-8") as f:
        json_result = json.load(f)
    nb_pages = len(PdfReader(file_path).pages)
    bb_list = json_result["list"]
    name = json_result["name"]
    # max(1, ...) keeps the divisor non-zero should there be more than 1000 pages.
    column_width = max(1, 1000 // nb_pages)
    last_page = nb_pages - 1

    def page_number(b):
        # The integer column width makes the columns cover slightly less than
        # the full 0-1000 range (e.g. 3 * 333 = 999), so a box centred at the
        # far right would otherwise land on a page that does not exist.
        return min(((b[1] + b[3]) // 2) // column_width, last_page)

    result = []
    for d in bb_list:
        b, label = d["box_2d"], d["label"]
        result.append(
            (label, page_number(b), b[0] - carreau, b[2] - carreau, b[1], b[3])
        )
    result.sort(key=lambda entry: (entry[1], entry[2]))
    return (name, result)
def _classify_label(label):
    """Return ``(clean_label, kind)`` for a raw label.

    A leading ``|`` marks a left-anchored label (kind ``"L"``), a trailing
    ``|`` a right-anchored one (kind ``"R"``); anything else is normal
    (kind ``"N"``). The marker is removed from the returned label.
    """
    if label.startswith("|"):
        return label[1:], "L"
    if label.endswith("|"):
        return label[:-1], "R"
    return label, "N"


def _scale_coord(y, page):
    """Scale ``y`` from the 0-1000 range to PDF points on ``page``."""
    return (y / 1000) * page.rect.height


def _save_cropped_page(doc, page_num, x0, y0, x1, y1, out_path):
    """Save a cropped portion of page ``page_num`` of ``doc`` as a new PDF.

    ``x0, y0, x1, y1`` are given in PDF points. The crop rectangle is offset
    horizontally by the x-origin of ``page.rect`` mapped through
    ``page.transformation_matrix`` and then mapped through
    ``page.derotation_matrix`` before being used as the clip.
    NOTE(review): presumably this makes rotated pages crop as they are
    displayed - confirm against the PyMuPDF documentation.
    """
    page = doc[page_num]
    rotated_rect = page.rect * page.transformation_matrix
    visual_crop_rect = fitz.Rect(rotated_rect.x0 + x0, y0, rotated_rect.x0 + x1, y1)
    unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
    temp_doc = fitz.open()
    try:
        temp_page = temp_doc.new_page(
            width=visual_crop_rect.width,
            height=visual_crop_rect.height,
        )
        temp_page.show_pdf_page(
            temp_page.rect,
            doc,
            page_num,
            rotate=-page.rotation,
            clip=unrotated_clip_rect,
        )
        temp_doc.save(out_path)
    finally:
        # Release the scratch document even when rendering or saving fails.
        temp_doc.close()


def split_an_interro(base_dir, input_pdf, coords_list):
    """Split ``input_pdf`` into one PDF per label.

    Args:
        base_dir: ``Path`` of the directory in which the output folder
            (named after the stem of ``input_pdf``) is created.
        input_pdf: ``Path`` of the PDF to split.
        coords_list: Iterable of ``(label, page, y0, y1, x0, x1)`` tuples with
            coordinates on a 0-1000 scale, in reading order.

    Behaviour:
        * A label starting with ``|`` is left-anchored (L), one ending with
          ``|`` is right-anchored (R), any other is normal (N).
        * Consecutive entries with the same cleaned label are collapsed into
          the first one.
        * The label ``_`` produces no output but still ends the region of a
          preceding label.
        * A region runs from the label's ``y0`` on its page to the next
          stopping label: the next L or N for an L label, the next R or N for
          an R label, the next label of any kind for an N label. Without a
          stopping label it runs to the bottom of the last page.
        * L regions span from the label's ``x0`` to the right edge; R regions
          span from the left edge to the horizontal centre of the L label
          whose ``y0`` is closest; N regions span the full width.
        * All parts sharing a label are merged into ``<label>.pdf``. Files
          already in the output folder that were not produced by this run are
          moved to its ``Missing`` sub-folder.
    """
    import tempfile  # local import keeps this block self-contained

    output_dir = base_dir / input_pdf.stem
    generated_files = set()
    parts_by_label = defaultdict(list)

    # 1. Strip the '|' markers and record each label's kind (L / R / N).
    parsed_coords = [
        (*_classify_label(label), pn, y0, y1, x0, x1)
        for label, pn, y0, y1, x0, x1 in coords_list
    ]

    # 2. Drop consecutive entries that share the same cleaned label.
    regions = []
    for item in parsed_coords:
        if not regions or item[0] != regions[-1][0]:
            regions.append(item)

    # Left-anchored labels (from the unfiltered list); loop-invariant.
    left_labels = [it for it in parsed_coords if it[1] == "L"]

    doc = fitz.open(input_pdf)
    try:
        last_page = doc.page_count - 1
        # Intermediate parts live in a private scratch directory so they never
        # collide with other runs and are removed even if an error occurs.
        with tempfile.TemporaryDirectory(prefix="split_parts_") as tmp_dir:
            for idx, region in enumerate(regions):
                clean_label, c_type, start_page, y_start_raw, _, x0_raw, _ = region
                if clean_label == "_":
                    continue
                # Register the label even if no part ends up being produced,
                # so that an output file is still written for it.
                label_parts = parts_by_label[clean_label]

                # RULE 2: find the label at which this region stops.
                end_page = last_page
                end_y_target_raw = 1000
                for nxt in regions[idx + 1:]:
                    n_type, n_pn, n_y_start = nxt[1], nxt[2], nxt[3]
                    if c_type == "L":
                        is_stop = n_type in ("L", "N")
                    elif c_type == "R":
                        is_stop = n_type in ("R", "N")
                    else:
                        is_stop = True  # normal labels stop at anything
                    if is_stop:
                        # Never step past the last page of the document.
                        end_page = min(n_pn, last_page)
                        # One grid square was subtracted from the label
                        # positions earlier; add it back (with some margin).
                        end_y_target_raw = min(n_y_start + int(1.25 * carreau), 1000)
                        break

                # RULES 3 & 4: horizontal bounds as fractions (0.0 to 1.0) of
                # the local page width.
                col_w = 1000 / doc.page_count
                fraction_x0 = 0.0
                fraction_x1 = 1.0
                if c_type == "L":  # |name
                    fraction_x0 = (x0_raw % col_w) / col_w
                    end_y_target_raw = min(1000, end_y_target_raw + 40)
                elif c_type == "R" and left_labels:  # name|
                    # Use the L label closest in y-distance as the right bound.
                    closest_left = min(
                        left_labels, key=lambda it: abs(it[3] - y_start_raw)
                    )
                    x_center = (closest_left[5] + closest_left[6]) / 2.0
                    fraction_x1 = (x_center % col_w) / col_w
                    if fraction_x1 <= fraction_x0:
                        fraction_x1 = 1.0  # fallback: full width

                first_page = min(start_page, last_page)
                for page_no in range(first_page, end_page + 1):
                    page = doc[page_no]
                    y0 = _scale_coord(y_start_raw, page) if page_no == first_page else 0
                    if page_no == end_page:
                        y1 = _scale_coord(end_y_target_raw, page)
                    else:
                        y1 = page.rect.height
                    if y1 <= y0 + 1:
                        continue  # nothing worth cropping on this page
                    part_path = os.path.join(tmp_dir, f"part_{idx}_{page_no}.pdf")
                    _save_cropped_page(
                        doc,
                        page_no,
                        fraction_x0 * page.rect.width,
                        y0,
                        fraction_x1 * page.rect.width,
                        y1,
                        part_path,
                    )
                    label_parts.append(part_path)

            output_dir.mkdir(parents=True, exist_ok=True)
            # Merge the parts of each label into a single output file.
            for title, parts in parts_by_label.items():
                merger = PdfWriter()
                for part in parts:
                    merger.append(part)
                filename = f"{title}.pdf"
                merger.write(output_dir / filename)
                merger.close()
                generated_files.add(filename)
    finally:
        doc.close()

    # Move files not generated in this run to the 'Missing' folder. The
    # listing is materialised first because the loop modifies the directory.
    missing_dir = output_dir / "Missing"
    for item in list(output_dir.iterdir()):
        if item.is_file() and item.name not in generated_files:
            print(f"ALERT: File '{item.name}' not generated. Moving to {missing_dir}")
            missing_dir.mkdir(exist_ok=True)
            # replace() overwrites an older copy on every platform, whereas
            # rename() fails on Windows when the target already exists.
            item.replace(missing_dir / item.name)
if __name__ == "__main__":
    # One positional argument is required: a PDF file or a folder of PDFs.
    if len(sys.argv) < 2:
        print("Usage: python script.py <directory or pdf_file>")
        sys.exit(1)

    input_arg = Path(sys.argv[1])
    if input_arg.is_dir():
        # Folder: process every PDF directly inside it, in sorted order.
        base_dir, pdf_files = input_arg, sorted(input_arg.glob("*.pdf"))
    elif input_arg.is_file():
        # Single file: outputs go next to it.
        base_dir, pdf_files = input_arg.parent, [input_arg]
    else:
        print(f"Error: {input_arg} is not a valid file or directory.")
        sys.exit(1)

    for pdf_path in pdf_files:
        # A PDF is only split when a JSON file with the same stem sits beside it.
        if not pdf_path.with_suffix(".json").exists():
            print(f"Warning: No JSON found for {pdf_path.name}")
            continue
        name, coords = decode_json(pdf_path)
        print("Decoded name : ", name)
        split_an_interro(base_dir, pdf_path, coords)