Copies/splitting_int.py

176 lines
5.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import json
import os
import shutil
import sys
import tempfile
from collections import defaultdict
from pathlib import Path

import fitz  # PyMuPDF
from pypdf import PdfReader
from pypdf import PdfWriter
def decode_json(pdf_file):
    """Load the label boxes stored next to a PDF and assign each to a page.

    The sidecar file is ``pdf_file`` with its suffix replaced by ``.json``.
    It must contain a ``"name"`` entry and a ``"list"`` of dicts, each with a
    ``"box_2d"`` sequence and a ``"label"``.  Coordinates are on a 0-1000
    scale.  The 0-1000 range is cut into ``nb_pages`` equal columns and the
    midpoint of ``box_2d[1]``/``box_2d[3]`` selects the column, i.e. the page
    index (presumably ``box_2d`` is ``[ymin, xmin, ymax, xmax]`` on an image
    showing all pages side by side -- confirm against the JSON producer).

    Args:
        pdf_file: Path (or string) of the PDF whose sidecar JSON is read.

    Returns:
        ``(name, result)`` where ``result`` is a list of
        ``(label, page_index, box_2d[0] - carreau, box_2d[2] - carreau)``
        tuples sorted by page index, then by the third element.
        ``carreau`` is ``1000 // 38``; the shifted values may be negative for
        boxes close to 0.

    Raises:
        ValueError: If the PDF has no pages.
        KeyError: If a required key is missing from the JSON.
    """
    file_path = Path(pdf_file)
    # JSON text is UTF-8 by specification; do not depend on the locale.
    with open(file_path.with_suffix(".json"), "r", encoding="utf-8") as f:
        json_result = json.load(f)

    nb_pages = len(PdfReader(file_path).pages)
    if nb_pages == 0:
        raise ValueError(f"{file_path} has no pages")

    bb_list = json_result["list"]
    name = json_result["name"]

    # max(1, ...) keeps the division below valid for documents with more
    # than 1000 pages, where 1000 // nb_pages would be 0.
    column_width = max(1, 1000 // nb_pages)
    last_page = nb_pages - 1
    # Offset subtracted from both stored coordinates ("carreau" = square;
    # presumably one grid square out of 38 -- confirm).
    carreau = 1000 // 38

    def page_number(b):
        """Return the page index of box ``b``, clamped to existing pages."""
        pn = int(((b[1] + b[3]) // 2) // column_width)
        # 1000 is rarely a multiple of nb_pages, so a midpoint close to 1000
        # would otherwise map to index nb_pages (e.g. 999 // 333 == 3 for a
        # 3-page document).
        return min(max(pn, 0), last_page)

    result = []
    for d in bb_list:
        b, label = d["box_2d"], d["label"]
        result.append((label, page_number(b), b[0] - carreau, b[2] - carreau))

    result.sort(key=lambda x: (x[1], x[2]))
    return (name, result)
def split_an_interro(base_dir, input_pdf, coords_list):
    """Split a PDF into one output PDF per label.

    Each entry of ``coords_list`` marks where a label starts.  The region of
    a label runs from its own start position to the end position carried by
    the *next* entry (or to the end of the document for the last entry) and
    may span several pages.  Every page slice is cropped into a temporary
    PDF; the slices of a label are then merged, in order, into
    ``base_dir / input_pdf.stem / "<label>.pdf"``.  Files already present in
    that directory that were not written by this run are moved to a
    ``Missing`` sub-directory.

    Args:
        base_dir: ``Path`` of the directory under which the output
            directory is created.
        input_pdf: ``Path`` of the PDF to split.
        coords_list: Sequence of ``(label, page_index, y_start, y_end)``
            tuples with y values on a 0-1000 scale of the page height.
            Consecutive entries with the same label are collapsed into the
            first one.  The sequence is expected to be ordered by page and
            position.

    Returns:
        None.  The results are the files written to the output directory.
    """
    output_dir = base_dir / input_pdf.stem
    generated_files = set()
    parts_by_label = defaultdict(list)

    # Filter consecutive duplicate labels: keep only the first of each run.
    filtered_coords = []
    for item in coords_list:
        if not filtered_coords or item[0] != filtered_coords[-1][0]:
            filtered_coords.append(item)
    coords_list = filtered_coords

    def scale_coord(y, page):
        """Scale y from the 0-1000 range to PDF points, kept on the page."""
        page_height = page.rect.height
        # Clamp: upstream offsets can push y below 0, and a crop rectangle
        # reaching outside the page would distort the cropped slice.
        return min(max((y / 1000) * page_height, 0), page_height)

    def save_cropped_page(doc, page_num, y0, y1, out_path):
        """Save the horizontal band y0..y1 of a page as a new one-page PDF."""
        page = doc[page_num]
        # Only the horizontal extent of this rectangle is used; the vertical
        # extent comes from y0/y1.
        rotated_rect = page.rect * page.transformation_matrix
        visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
        # show_pdf_page expects the clip in the source page's unrotated
        # coordinates.
        unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
        temp_doc = fitz.open()
        try:
            temp_page = temp_doc.new_page(
                width=visual_crop_rect.width,
                height=visual_crop_rect.height,
            )
            temp_page.show_pdf_page(
                temp_page.rect,
                doc,
                page_num,
                rotate=-page.rotation,
                clip=unrotated_clip_rect,
            )
            temp_doc.save(out_path)
        finally:
            temp_doc.close()

    doc = fitz.open(input_pdf)
    try:
        last_page = doc.page_count - 1
        # Intermediate slices live in a private temporary directory so that
        # they never collide with files in the working directory and are
        # removed even when an error interrupts the run.
        with tempfile.TemporaryDirectory(prefix="split_parts_") as tmp_name:
            tmp_dir = Path(tmp_name)

            for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list):
                # Register the label even if it ends up with no slice, so an
                # output file is still produced for it.
                label_parts = parts_by_label[title]
                if last_page < 0:
                    continue  # document without pages: nothing to crop

                if idx + 1 < len(coords_list):
                    # Normal case: stop at the next label.
                    _, end_page, _, end_y_target_raw = coords_list[idx + 1]
                else:
                    # Last label extends to the very end of the document.
                    end_page = last_page
                    end_y_target_raw = 1000  # 1000 represents full height

                # Keep page indices inside the document; an index past the
                # last page is treated as the last page.
                start_page = min(max(start_page, 0), last_page)
                end_page = min(end_page, last_page)

                # Visit every page from start to end, including pages that
                # lie entirely between two labels.
                for current_p in range(start_page, end_page + 1):
                    page = doc[current_p]
                    # Top cut: label start on the first page, else page top.
                    if current_p == start_page:
                        y0 = scale_coord(y_start_raw, page)
                    else:
                        y0 = 0
                    # Bottom cut: target on the last page, else page bottom.
                    if current_p == end_page:
                        y1 = scale_coord(end_y_target_raw, page)
                    else:
                        y1 = page.rect.height
                    # Only save if the slice has height (avoid empty files).
                    if y1 > y0 + 1:
                        temp_path = str(tmp_dir / f"_part_{idx}_{current_p}.pdf")
                        save_cropped_page(doc, current_p, y0, y1, temp_path)
                        label_parts.append(temp_path)

            output_dir.mkdir(parents=True, exist_ok=True)

            # Merge the slices of each label while the parts still exist.
            for title, parts in parts_by_label.items():
                filename = f"{title}.pdf"
                merger = PdfWriter()
                try:
                    for part in parts:
                        merger.append(part)
                    merger.write(output_dir / filename)
                finally:
                    merger.close()
                generated_files.add(filename)
    finally:
        doc.close()

    # Move files not generated in this run to the 'Missing' folder.  The
    # listing is materialised first because the loop changes the directory.
    if output_dir.exists():
        missing_dir = output_dir / "Missing"
        for item in list(output_dir.iterdir()):
            if item.is_file() and item.name not in generated_files:
                print(f"ALERT: File '{item.name}' not generated. Moving to {missing_dir}")
                missing_dir.mkdir(exist_ok=True)
                # replace() overwrites an older file of the same name on
                # every platform; rename() raises on Windows in that case.
                item.replace(missing_dir / item.name)
def main(argv=None):
    """Split one PDF, or every PDF of a directory, using its sidecar JSON.

    Args:
        argv: Argument vector in ``sys.argv`` layout (program name first,
            then the path of a PDF file or of a directory).  Defaults to
            ``sys.argv``.

    Returns:
        Process exit code: 0 on success, 1 when the argument is missing or
        is neither a file nor a directory.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) < 2:
        # Report the real script name instead of a hard-coded one.
        script = Path(argv[0]).name if argv and argv[0] else "splitting_int.py"
        print(f"Usage: python {script} <directory or pdf_file>", file=sys.stderr)
        return 1

    input_arg = Path(argv[1])
    if input_arg.is_file():
        base_dir = input_arg.parent
        pdf_files = [input_arg]
    elif input_arg.is_dir():
        base_dir = input_arg
        pdf_files = sorted(base_dir.glob("*.pdf"))
    else:
        print(f"Error: {input_arg} is not a valid file or directory.", file=sys.stderr)
        return 1

    for pdf_path in pdf_files:
        # A PDF is only processed when a JSON file with the same stem exists.
        json_path = pdf_path.with_suffix(".json")
        if json_path.exists():
            name, coords = decode_json(pdf_path)
            print("Decoded name : ", name)
            split_an_interro(base_dir, pdf_path, coords)
        else:
            print(f"Warning: No JSON found for {pdf_path.name}")
    return 0


if __name__ == "__main__":
    sys.exit(main())