177 lines
5.7 KiB
Python
177 lines
5.7 KiB
Python
import fitz # PyMuPDF
|
||
from pypdf import PdfWriter
|
||
from pypdf import PdfReader
|
||
import os
|
||
import sys
|
||
import json
|
||
import shutil
|
||
from pathlib import Path
|
||
from collections import defaultdict
|
||
|
||
def decode_json(pdf_file):
    """Load the label boxes stored next to a PDF and assign each to a page.

    The sidecar file has the same path as ``pdf_file`` but with a ``.json``
    suffix. It must contain a ``"name"`` entry and a ``"list"`` of items,
    each with a ``"box_2d"`` sequence and a ``"label"``.

    Args:
        pdf_file: Path (``str`` or ``Path``) of the PDF file.

    Returns:
        A ``(name, result)`` tuple. ``result`` is a list of
        ``(label, page_index, y_start, y_end)`` tuples sorted by page index,
        then by ``y_start``. ``y_start`` / ``y_end`` are ``box_2d[0]`` and
        ``box_2d[2]`` shifted up by a fixed offset.

    Raises:
        ZeroDivisionError: If the PDF reports zero pages.
    """
    file_path = Path(pdf_file)
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(file_path.with_suffix(".json"), "r", encoding="utf-8") as f:
        json_result = json.load(f)

    nb_pages = len(PdfReader(file_path).pages)
    last_page = nb_pages - 1

    bb_list = json_result["list"]
    name = json_result["name"]

    # NOTE(review): box_2d is presumably [y_min, x_min, y_max, x_max] on a
    # 0-1000 scale, with all pages laid out side by side along x — confirm
    # against whatever produces the JSON.
    column_width = 1000 // nb_pages

    # Fixed vertical shift applied to both y values.
    # NOTE(review): 1000 // 38 presumably corresponds to one grid square of
    # the sheet — confirm.
    margin = 1000 // 38

    def page_number(b):
        # Floor division can yield an index equal to nb_pages (centre at the
        # right edge, or 1000 not divisible by nb_pages), so clamp it to the
        # last existing page.
        return min(((b[1] + b[3]) // 2) // column_width, last_page)

    result = []
    for d in bb_list:
        b, label = d["box_2d"], d["label"]
        result.append((label, page_number(b), b[0] - margin, b[2] - margin))

    result.sort(key=lambda x: (x[1], x[2]))
    return (name, result)
|
||
|
||
|
||
def split_an_interro(base_dir, input_pdf, coords_list):
    """Split one PDF into per-label PDFs under ``base_dir / input_pdf.stem``.

    Each entry of ``coords_list`` opens a section that starts at its own
    ``y_start`` on its page and stops at the fourth element of the next
    entry, on that entry's page. The last section runs to the bottom of the
    last page. Sections may span several pages. Every page slice is cropped
    to its own PDF, and all slices sharing a label are merged into
    ``<label>.pdf``.

    Files already present in the output directory that were not produced by
    this call are moved to a ``Missing`` sub-folder.

    Args:
        base_dir: ``Path`` of the directory that receives the output folder.
        input_pdf: ``Path`` of the PDF to split.
        coords_list: List of ``(label, page_index, y_start, y_end)`` tuples,
            with y values on a 0-1000 scale of the page height. Runs of
            consecutive entries with the same label are collapsed to the
            first entry of the run.
    """
    # Local import: only this function needs a scratch directory.
    import tempfile

    output_dir = base_dir / input_pdf.stem
    generated_files = set()
    parts_by_label = defaultdict(list)

    # Filter consecutive duplicate labels (keep the first of each run).
    filtered_coords = []
    for item in coords_list:
        if not filtered_coords or item[0] != filtered_coords[-1][0]:
            filtered_coords.append(item)
    coords_list = filtered_coords

    def scale_coord(y, page):
        """Scale y from the 0-1000 range to PDF points, clamped to the page."""
        # Incoming values may fall outside 0-1000 (e.g. a negative start after
        # an offset was subtracted); clamping keeps the crop inside the page.
        clamped = min(max(y, 0), 1000)
        return (clamped / 1000) * page.rect.height

    def save_cropped_page(doc, page_num, y0, y1, out_path):
        """Save the horizontal band [y0, y1] of a page as a one-page PDF."""
        page = doc[page_num]
        rotated_rect = page.rect * page.transformation_matrix
        visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
        unrotated_clip_rect = visual_crop_rect * page.derotation_matrix

        temp_doc = fitz.open()
        temp_page = temp_doc.new_page(
            width=visual_crop_rect.width,
            height=visual_crop_rect.height
        )
        temp_page.show_pdf_page(
            temp_page.rect,
            doc,
            page_num,
            rotate=-page.rotation,
            clip=unrotated_clip_rect
        )
        temp_doc.save(out_path)
        temp_doc.close()

    doc = fitz.open(input_pdf)
    try:
        # Intermediate slices live in a private scratch directory, so they
        # never pollute the working directory, cannot collide with another
        # run, and are removed even when an exception interrupts processing.
        with tempfile.TemporaryDirectory(prefix="split_an_interro_") as tmp_dir:
            last_page = doc.page_count - 1

            # Iterate through all labels.
            for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list):
                temp_parts = []

                # Determine the stopping point for this label.
                if idx + 1 < len(coords_list):
                    # Normal case: stop at the next label.
                    _, end_page, _, end_y_target_raw = coords_list[idx + 1]
                else:
                    # Last label extends to the very end of the document.
                    end_page = last_page
                    end_y_target_raw = 1000  # 1000 represents full height

                # Guard against page indices beyond the document.
                start_page = min(start_page, last_page)
                end_page = min(end_page, last_page)

                # Visit every page from start to end, including the pages
                # fully covered between the two.
                for current_p in range(start_page, end_page + 1):
                    page = doc[current_p]

                    # Top cut: the label position on the first page,
                    # otherwise the top of the page.
                    if current_p == start_page:
                        y0 = scale_coord(y_start_raw, page)
                    else:
                        y0 = 0

                    # Bottom cut: the target position on the last page,
                    # otherwise the bottom of the page.
                    if current_p == end_page:
                        y1 = scale_coord(end_y_target_raw, page)
                    else:
                        y1 = page.rect.height

                    # Only save if the slice has height (avoid empty files).
                    if y1 > y0 + 1:
                        temp_path = os.path.join(
                            tmp_dir, f"part_{idx}_{current_p}.pdf"
                        )
                        save_cropped_page(doc, current_p, y0, y1, temp_path)
                        temp_parts.append(temp_path)

                parts_by_label[title].extend(temp_parts)

            output_dir.mkdir(parents=True, exist_ok=True)

            # Merge the aggregated parts of each label into one PDF.
            for title, parts in parts_by_label.items():
                merger = PdfWriter()
                for part in parts:
                    merger.append(part)

                filename = f"{title}.pdf"
                merger.write(output_dir / filename)
                merger.close()
                generated_files.add(filename)
    finally:
        doc.close()

    # Move files not generated in this run to the 'Missing' folder.
    if output_dir.exists():
        missing_dir = output_dir / "Missing"
        for item in output_dir.iterdir():
            if item.is_file() and item.name not in generated_files:
                print(f"ALERT: File '{item.name}' not generated. Moving to {missing_dir}")
                missing_dir.mkdir(exist_ok=True)
                item.rename(missing_dir / item.name)
|
||
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: takes either a single PDF or a directory of
    # PDFs, and splits every PDF that has a sibling ".json" file.
    if len(sys.argv) < 2:
        print("Usage: python script.py <directory or pdf_file>")
        sys.exit(1)

    input_arg = Path(sys.argv[1])

    if input_arg.is_file():
        # Single file: results go next to it.
        base_dir, pdf_files = input_arg.parent, [input_arg]
    elif input_arg.is_dir():
        # Directory: handle its PDFs in sorted order.
        base_dir, pdf_files = input_arg, sorted(input_arg.glob("*.pdf"))
    else:
        print(f"Error: {input_arg} is not a valid file or directory.")
        sys.exit(1)

    for pdf_path in pdf_files:
        # A PDF without its JSON sidecar cannot be split; warn and move on.
        if not pdf_path.with_suffix(".json").exists():
            print(f"Warning: No JSON found for {pdf_path.name}")
            continue

        name, coords = decode_json(pdf_path)
        print("Decoded name : ", name)
        split_an_interro(base_dir, pdf_path, coords)
|