Fix missing pages
parent
a682e21e8b
commit
6215b2aad2
115
splitting_int.py
115
splitting_int.py
|
|
@ -6,17 +6,15 @@ import sys
|
||||||
import json
|
import json
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import defaultdict # Added for grouping
|
from collections import defaultdict
|
||||||
|
|
||||||
# input_pdf = "Une Interro/Split.pdf"
|
# input_pdf = "Une Interro/Split.pdf"
|
||||||
|
|
||||||
def decode_json(pdf_file):
|
def decode_json(pdf_file):
|
||||||
file_path = Path(pdf_file)
|
file_path = Path(pdf_file)
|
||||||
# Load JSON content from File.json
|
|
||||||
with open(file_path.with_suffix(".json"), "r") as f:
|
with open(file_path.with_suffix(".json"), "r") as f:
|
||||||
json_result = json.load(f)
|
json_result = json.load(f)
|
||||||
|
|
||||||
# Get number of pages from File.pdf
|
|
||||||
nb_pages = len(PdfReader(file_path).pages)
|
nb_pages = len(PdfReader(file_path).pages)
|
||||||
|
|
||||||
bb_list = json_result["list"]
|
bb_list = json_result["list"]
|
||||||
|
|
@ -26,15 +24,11 @@ def decode_json(pdf_file):
|
||||||
def page_number(b):
|
def page_number(b):
|
||||||
return ((b[1] + b[3]) // 2) // column_width
|
return ((b[1] + b[3]) // 2) // column_width
|
||||||
|
|
||||||
result = [] # contient la page, et, en millième, au dessus du label
|
result = []
|
||||||
# (marge en plus), et au dessus du label (marge en moins)
|
|
||||||
for d in bb_list:
|
for d in bb_list:
|
||||||
(b, label) = d["box_2d"], d["label"]
|
(b, label) = d["box_2d"], d["label"]
|
||||||
# print(b)
|
|
||||||
pn = page_number(b)
|
pn = page_number(b)
|
||||||
# 38 carreaux dans une page
|
|
||||||
carreau = 1000 // 38
|
carreau = 1000 // 38
|
||||||
# result.append((label, pn, b[2] - 3 * carreau, b[2] + int(carreau)))
|
|
||||||
result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau)))
|
result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau)))
|
||||||
result.sort(key=lambda x: (x[1], x[2]))
|
result.sort(key=lambda x: (x[1], x[2]))
|
||||||
return (name, result)
|
return (name, result)
|
||||||
|
|
@ -45,18 +39,13 @@ def split_an_interro(base_dir,input_pdf, coords_list):
|
||||||
|
|
||||||
output_dir = base_dir / input_pdf.stem
|
output_dir = base_dir / input_pdf.stem
|
||||||
generated_files = set()
|
generated_files = set()
|
||||||
|
|
||||||
# Dictionary to collect parts for each label
|
|
||||||
parts_by_label = defaultdict(list)
|
parts_by_label = defaultdict(list)
|
||||||
|
|
||||||
# Filter coords_list to remove consecutive duplicate labels.
|
# Filter consecutive duplicate labels
|
||||||
# If a label appears at the end of a page and again at the start of the next,
|
|
||||||
# we want to treat it as one continuous block, not two separate cuts.
|
|
||||||
filtered_coords = []
|
filtered_coords = []
|
||||||
if coords_list:
|
if coords_list:
|
||||||
filtered_coords.append(coords_list[0])
|
filtered_coords.append(coords_list[0])
|
||||||
for item in coords_list[1:]:
|
for item in coords_list[1:]:
|
||||||
# item[0] is the label/title
|
|
||||||
if item[0] != filtered_coords[-1][0]:
|
if item[0] != filtered_coords[-1][0]:
|
||||||
filtered_coords.append(item)
|
filtered_coords.append(item)
|
||||||
coords_list = filtered_coords
|
coords_list = filtered_coords
|
||||||
|
|
@ -67,79 +56,67 @@ def split_an_interro(base_dir,input_pdf, coords_list):
|
||||||
return (y / 1000) * page_height
|
return (y / 1000) * page_height
|
||||||
|
|
||||||
def save_cropped_page(doc, page_num, y0, y1, out_path):
|
def save_cropped_page(doc, page_num, y0, y1, out_path):
|
||||||
"""
|
"""Saves a cropped portion of a page as a new PDF."""
|
||||||
Saves a cropped portion of a page as a new PDF,
|
|
||||||
correctly handling the original page's rotation.
|
|
||||||
"""
|
|
||||||
# print(f"Saving cropped_page with : {y0} and {y1}")
|
|
||||||
# Get the source page object
|
|
||||||
page = doc[page_num]
|
page = doc[page_num]
|
||||||
# print("Debug : ", page_num, y0, y1, output_dir)
|
|
||||||
# 1. Define the crop rectangle in the VISUAL (rotated) coordinate system.
|
|
||||||
# The page.rect gives unrotated dimensions, so we apply the transformation
|
|
||||||
# matrix to get the visual dimensions.
|
|
||||||
rotated_rect = page.rect * page.transformation_matrix
|
rotated_rect = page.rect * page.transformation_matrix
|
||||||
visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
|
visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
|
||||||
|
|
||||||
# 2. Transform this visual crop rectangle back into the UNROTATED system.
|
|
||||||
# The 'clip' argument for show_pdf_page requires unrotated coordinates.
|
|
||||||
# The derotation_matrix does this conversion for us.
|
|
||||||
unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
|
unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
|
||||||
|
|
||||||
# Create a new temporary document for the output
|
|
||||||
temp_doc = fitz.open()
|
temp_doc = fitz.open()
|
||||||
|
|
||||||
# Create a new page with the dimensions of our visual crop
|
|
||||||
temp_page = temp_doc.new_page(
|
temp_page = temp_doc.new_page(
|
||||||
width=visual_crop_rect.width,
|
width=visual_crop_rect.width,
|
||||||
height=visual_crop_rect.height
|
height=visual_crop_rect.height
|
||||||
)
|
)
|
||||||
|
|
||||||
# Display the cropped and de-rotated content on the new page
|
|
||||||
temp_page.show_pdf_page(
|
temp_page.show_pdf_page(
|
||||||
temp_page.rect, # Where to place the content on the new page (the whole page)
|
temp_page.rect,
|
||||||
doc, # Source document
|
doc,
|
||||||
page_num,
|
page_num,
|
||||||
rotate=-page.rotation, # Cancel the original page's rotation
|
rotate=-page.rotation,
|
||||||
clip=unrotated_clip_rect # The area to take from the source page
|
clip=unrotated_clip_rect
|
||||||
)
|
)
|
||||||
|
|
||||||
# Save the new one-page PDF and close the document
|
|
||||||
temp_doc.save(out_path)
|
temp_doc.save(out_path)
|
||||||
temp_doc.close()
|
temp_doc.close()
|
||||||
|
|
||||||
|
# Iterate through all labels
|
||||||
for idx, (title, page_nb, ymin, _) in enumerate(coords_list):
|
for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list):
|
||||||
temp_parts = []
|
temp_parts = []
|
||||||
y_start = scale_coord(ymin, doc[page_nb])
|
|
||||||
|
|
||||||
|
# Determine the stopping point for this label
|
||||||
if idx + 1 < len(coords_list):
|
if idx + 1 < len(coords_list):
|
||||||
_, next_page_nb, _, next_ymax = coords_list[idx + 1]
|
# Normal case: stop at the next label
|
||||||
if next_page_nb == page_nb:
|
_, end_page, _, y_end_raw = coords_list[idx + 1]
|
||||||
# Same page
|
end_y_target_raw = y_end_raw
|
||||||
y_end = scale_coord(next_ymax, doc[page_nb])
|
|
||||||
temp_path = f"_part_{idx}_0.pdf"
|
|
||||||
save_cropped_page(doc, page_nb, y_start, y_end, temp_path)
|
|
||||||
temp_parts.append(temp_path)
|
|
||||||
else:
|
else:
|
||||||
# Current page part
|
# FIX BUG 1: Last label extends to the very end of the document
|
||||||
temp_path1 = f"_part_{idx}_0.pdf"
|
end_page = doc.page_count - 1
|
||||||
save_cropped_page(doc, page_nb, y_start, doc[page_nb].rect.height, temp_path1)
|
end_y_target_raw = 1000 # 1000 represents full height
|
||||||
temp_parts.append(temp_path1)
|
|
||||||
|
|
||||||
# Next page part
|
# FIX BUG 2: Iterate through EVERY page from start to end
|
||||||
y_end_next = scale_coord(next_ymax, doc[next_page_nb])
|
# This handles cases where start_page == end_page, start_page + 1 == end_page,
|
||||||
temp_path2 = f"_part_{idx}_1.pdf"
|
# AND start_page + N == end_page (gaps)
|
||||||
if y_end_next >= 10:
|
current_p = start_page
|
||||||
save_cropped_page(doc, next_page_nb, 0, y_end_next, temp_path2)
|
while current_p <= end_page:
|
||||||
temp_parts.append(temp_path2)
|
|
||||||
|
# Determine Top Cut (y0)
|
||||||
|
if current_p == start_page:
|
||||||
|
y0 = scale_coord(y_start_raw, doc[current_p])
|
||||||
else:
|
else:
|
||||||
# Last segment to end of page
|
y0 = 0 # Start from top of page for intermediate/last pages
|
||||||
temp_path = f"_part_{idx}_0.pdf"
|
|
||||||
save_cropped_page(doc, page_nb, y_start, doc[page_nb].rect.height, temp_path)
|
# Determine Bottom Cut (y1)
|
||||||
|
if current_p == end_page:
|
||||||
|
y1 = scale_coord(end_y_target_raw, doc[current_p])
|
||||||
|
else:
|
||||||
|
y1 = doc[current_p].rect.height # Go to bottom of intermediate pages
|
||||||
|
|
||||||
|
# Only save if the slice has height (avoid empty files)
|
||||||
|
if y1 > y0 + 1:
|
||||||
|
temp_path = f"_part_{idx}_{current_p}.pdf"
|
||||||
|
save_cropped_page(doc, current_p, y0, y1, temp_path)
|
||||||
temp_parts.append(temp_path)
|
temp_parts.append(temp_path)
|
||||||
|
|
||||||
# Collect parts for this label instead of writing immediately
|
current_p += 1
|
||||||
|
|
||||||
parts_by_label[title].extend(temp_parts)
|
parts_by_label[title].extend(temp_parts)
|
||||||
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
@ -148,6 +125,7 @@ def split_an_interro(base_dir,input_pdf, coords_list):
|
||||||
for title, parts in parts_by_label.items():
|
for title, parts in parts_by_label.items():
|
||||||
merger = PdfWriter()
|
merger = PdfWriter()
|
||||||
for part in parts:
|
for part in parts:
|
||||||
|
if os.path.exists(part):
|
||||||
merger.append(part)
|
merger.append(part)
|
||||||
|
|
||||||
filename = f"{title}.pdf"
|
filename = f"{title}.pdf"
|
||||||
|
|
@ -155,19 +133,19 @@ def split_an_interro(base_dir,input_pdf, coords_list):
|
||||||
merger.close()
|
merger.close()
|
||||||
generated_files.add(filename)
|
generated_files.add(filename)
|
||||||
|
|
||||||
# Cleanup temporary files for this label
|
# Cleanup
|
||||||
for part in parts:
|
for part in parts:
|
||||||
if os.path.exists(part):
|
if os.path.exists(part):
|
||||||
os.remove(part)
|
os.remove(part)
|
||||||
|
|
||||||
doc.close()
|
doc.close()
|
||||||
# --- Cleanup Logic ---
|
|
||||||
# Move files not generated in this run to 'Missing' folder
|
# Move files not generated in this run to 'Missing' folder
|
||||||
if output_dir.exists():
|
if output_dir.exists():
|
||||||
missing_dir = output_dir / "Missing"
|
missing_dir = output_dir / "Missing"
|
||||||
for item in output_dir.iterdir():
|
for item in output_dir.iterdir():
|
||||||
if item.is_file() and item.name not in generated_files:
|
if item.is_file() and item.name not in generated_files:
|
||||||
print(f"ALERT: File '{item.name}' in '{input_pdf.stem}' was not generated. Moving to {missing_dir}")
|
print(f"ALERT: File '{item.name}' not generated. Moving to {missing_dir}")
|
||||||
missing_dir.mkdir(exist_ok=True)
|
missing_dir.mkdir(exist_ok=True)
|
||||||
item.rename(missing_dir / item.name)
|
item.rename(missing_dir / item.name)
|
||||||
|
|
||||||
|
|
@ -180,12 +158,9 @@ if __name__ == "__main__":
|
||||||
input_arg = Path(sys.argv[1])
|
input_arg = Path(sys.argv[1])
|
||||||
|
|
||||||
if input_arg.is_file():
|
if input_arg.is_file():
|
||||||
# If a single file is provided, process only that file.
|
|
||||||
# base_dir is assumed to be the directory containing the file.
|
|
||||||
base_dir = input_arg.parent
|
base_dir = input_arg.parent
|
||||||
pdf_files = [input_arg]
|
pdf_files = [input_arg]
|
||||||
elif input_arg.is_dir():
|
elif input_arg.is_dir():
|
||||||
# If a directory is provided, process all PDFs inside.
|
|
||||||
base_dir = input_arg
|
base_dir = input_arg
|
||||||
pdf_files = sorted(base_dir.glob("*.pdf"))
|
pdf_files = sorted(base_dir.glob("*.pdf"))
|
||||||
else:
|
else:
|
||||||
|
|
@ -199,4 +174,4 @@ if __name__ == "__main__":
|
||||||
print("Decoded name : ", name)
|
print("Decoded name : ", name)
|
||||||
split_an_interro(base_dir, pdf_path, coords)
|
split_an_interro(base_dir, pdf_path, coords)
|
||||||
else:
|
else:
|
||||||
print(f"Warning: No JSON found for {pdf_path.name} at {json_path}")
|
print(f"Warning: No JSON found for {pdf_path.name}")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue