Compare commits
No commits in common. "a2f6f6eec0d2455b270e8a61cf890613c0e900cc" and "26d02b098774ff43f080b2bee05323d83dcd45a7" have entirely different histories.
a2f6f6eec0
...
26d02b0987
13
Readme.org
13
Readme.org
|
|
@ -1,7 +1,7 @@
|
||||||
#+title: Script
|
#+title: Script
|
||||||
#+author: Sébastien Miquel
|
#+author: Sébastien Miquel
|
||||||
#+date: 14-03-2026
|
#+date: 14-03-2026
|
||||||
# Time-stamp: <08-05-26 22:52>
|
# Time-stamp: <07-05-26 11:33>
|
||||||
#+OPTIONS:
|
#+OPTIONS:
|
||||||
|
|
||||||
* Quézaco
|
* Quézaco
|
||||||
|
|
@ -106,13 +106,7 @@ export GEMINI_API_KEY=…
|
||||||
labels des exercices/questions.
|
labels des exercices/questions.
|
||||||
|
|
||||||
Rerun on a single file with =python cutleft.py Interro/Copie01.pdf=
|
Rerun on a single file with =python cutleft.py Interro/Copie01.pdf=
|
||||||
|
5. =python enonce_info.py Interro= (gestion perso)
|
||||||
* Génération d'information sur l'énoncé
|
|
||||||
|
|
||||||
1. =python enonce_info.py Interro= (gestion perso)
|
|
||||||
OU
|
|
||||||
2. =python gemini_for_enonce.py Interro=
|
|
||||||
+ Nécessite =enonce.tex/org= et `correction.tex/org`
|
|
||||||
|
|
||||||
* Labelisation et regroupement
|
* Labelisation et regroupement
|
||||||
|
|
||||||
|
|
@ -130,7 +124,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
|
||||||
+ Quand un label est manquant, il est possible de cliquer sur
|
+ Quand un label est manquant, il est possible de cliquer sur
|
||||||
l'image, ce qui copie les coordonnées dans le presse papier
|
l'image, ce qui copie les coordonnées dans le presse papier
|
||||||
(sous linux…), puis on peut l'ajouter à la main.
|
(sous linux…), puis on peut l'ajouter à la main.
|
||||||
+ Utilisation de `_`, `|…` et `…|`
|
|
||||||
Pour modifier une seule copie :
|
Pour modifier une seule copie :
|
||||||
=python plotting.py Interro/Copie01.pdf=
|
=python plotting.py Interro/Copie01.pdf=
|
||||||
|
|
||||||
|
|
@ -157,7 +151,6 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
|
||||||
2. =python correction.py Interro --limit 240= OU
|
2. =python correction.py Interro --limit 240= OU
|
||||||
=python correction.py Interro/Ex\ 2/Group_1.jpg= OU
|
=python correction.py Interro/Ex\ 2/Group_1.jpg= OU
|
||||||
=python correction.py Interro --overwrite=
|
=python correction.py Interro --overwrite=
|
||||||
=python correction.py Interro --pro-by-label= (needs `labels_for_pro`)
|
|
||||||
|
|
||||||
Fais les requêtes de correction à Gemini.
|
Fais les requêtes de correction à Gemini.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -396,8 +396,6 @@ def render_score_text(label, score, error, width_px, fontsize=30,
|
||||||
|
|
||||||
return img
|
return img
|
||||||
|
|
||||||
A4_WIDTH_200DPI = 1654
|
|
||||||
TARGET_MIN_WIDTH = int(A4_WIDTH_200DPI * 0.9) # 1406 pixels
|
|
||||||
def compose_label_image(base_img, label, result, hmin,
|
def compose_label_image(base_img, label, result, hmin,
|
||||||
render_fn=render_real_latex_text,
|
render_fn=render_real_latex_text,
|
||||||
draw_callback=None,
|
draw_callback=None,
|
||||||
|
|
@ -417,17 +415,6 @@ def compose_label_image(base_img, label, result, hmin,
|
||||||
draw_callback: Optional function(type, draw_obj, position_dict, data_dict)
|
draw_callback: Optional function(type, draw_obj, position_dict, data_dict)
|
||||||
called when elements are placed. Used for checkboxes.
|
called when elements are placed. Used for checkboxes.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
left_pad = 0
|
|
||||||
if base_img.width < TARGET_MIN_WIDTH:
|
|
||||||
total_missing = TARGET_MIN_WIDTH - base_img.width
|
|
||||||
left_pad = min(total_missing, MARGIN_LEFT)
|
|
||||||
right_pad = total_missing - left_pad
|
|
||||||
|
|
||||||
new_base = Image.new("RGB", (TARGET_MIN_WIDTH, base_img.height), "white")
|
|
||||||
new_base.paste(base_img, (left_pad, 0))
|
|
||||||
base_img = new_base
|
|
||||||
|
|
||||||
score = result.get('score', 0)
|
score = result.get('score', 0)
|
||||||
error = result.get('error', "")
|
error = result.get('error', "")
|
||||||
feedbacks = result.get('feedback', [])
|
feedbacks = result.get('feedback', [])
|
||||||
|
|
@ -498,8 +485,8 @@ def compose_label_image(base_img, label, result, hmin,
|
||||||
|
|
||||||
target_ymin = (ymin - hmin) + image_offset_y
|
target_ymin = (ymin - hmin) + image_offset_y
|
||||||
target_ymax = (ymax - hmin) + image_offset_y
|
target_ymax = (ymax - hmin) + image_offset_y
|
||||||
target_xmin = xmin + MARGIN_LEFT + left_pad
|
target_xmin = xmin + MARGIN_LEFT
|
||||||
target_xmax = xmax + MARGIN_LEFT + left_pad
|
target_xmax = xmax + MARGIN_LEFT
|
||||||
|
|
||||||
# Draw Rectangle (if not suppressed)
|
# Draw Rectangle (if not suppressed)
|
||||||
if "norectangle" not in fb:
|
if "norectangle" not in fb:
|
||||||
|
|
|
||||||
|
|
@ -582,7 +582,7 @@ Here is a list of all possible labels. You need to answer with a list one of the
|
||||||
height = grouping.get_pdf_height(str(new_pdf_path))
|
height = grouping.get_pdf_height(str(new_pdf_path))
|
||||||
grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR)
|
grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR)
|
||||||
new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"),
|
new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"),
|
||||||
add_label, False))
|
add_label, False, f"{label}(->)"))
|
||||||
error += f"(->){add_label}"
|
error += f"(->){add_label}"
|
||||||
keep_error = True
|
keep_error = True
|
||||||
else:
|
else:
|
||||||
|
|
@ -603,6 +603,7 @@ def process_single_task(task_tuple, precomputed_response=None):
|
||||||
file_path = task_tuple[0]
|
file_path = task_tuple[0]
|
||||||
label = task_tuple[1]
|
label = task_tuple[1]
|
||||||
can_spawn_tasks = task_tuple[2] if len(task_tuple) > 2 else True
|
can_spawn_tasks = task_tuple[2] if len(task_tuple) > 2 else True
|
||||||
|
injected_error = task_tuple[3] if len(task_tuple) > 3 else ""
|
||||||
|
|
||||||
group_name = os.path.splitext(file_path)[0]
|
group_name = os.path.splitext(file_path)[0]
|
||||||
json_path = group_name + '.json'
|
json_path = group_name + '.json'
|
||||||
|
|
@ -648,6 +649,15 @@ def process_single_task(task_tuple, precomputed_response=None):
|
||||||
for p in json_data:
|
for p in json_data:
|
||||||
pid = p["id"]
|
pid = p["id"]
|
||||||
res = p["result"]
|
res = p["result"]
|
||||||
|
|
||||||
|
# Inject additional error if present
|
||||||
|
if injected_error:
|
||||||
|
if res["error"]:
|
||||||
|
res["error"] = f"{injected_error} {res['error']}"
|
||||||
|
else:
|
||||||
|
res["error"] = injected_error
|
||||||
|
|
||||||
|
|
||||||
yming, ymaxg, width_r = d_data[pid]
|
yming, ymaxg, width_r = d_data[pid]
|
||||||
|
|
||||||
pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf"
|
pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf"
|
||||||
|
|
@ -710,6 +720,8 @@ def process_single_task(task_tuple, precomputed_response=None):
|
||||||
tprint(f"Error decoding JSON for {file_path}", file=sys.stderr)
|
tprint(f"Error decoding JSON for {file_path}", file=sys.stderr)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Exception processing {file_path}: {e}"
|
error_msg = f"Exception processing {file_path}: {e}"
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc() # <--- Add this line to see the real crash
|
||||||
print(error_msg, file=sys.stderr)
|
print(error_msg, file=sys.stderr)
|
||||||
with io_lock:
|
with io_lock:
|
||||||
errors_summary.append((error_msg, file_path))
|
errors_summary.append((error_msg, file_path))
|
||||||
|
|
|
||||||
49
cutleft.py
49
cutleft.py
|
|
@ -26,8 +26,7 @@ if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'):
|
||||||
files = [os.path.basename(path_arg)]
|
files = [os.path.basename(path_arg)]
|
||||||
elif os.path.isdir(path_arg):
|
elif os.path.isdir(path_arg):
|
||||||
INPUT_DIR = path_arg
|
INPUT_DIR = path_arg
|
||||||
files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf') and
|
files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf')])
|
||||||
"nonc" not in f.lower()])
|
|
||||||
else:
|
else:
|
||||||
sys.exit("Error: Input must be a directory or a PDF file.")
|
sys.exit("Error: Input must be a directory or a PDF file.")
|
||||||
|
|
||||||
|
|
@ -84,20 +83,12 @@ def stitch_images(image_list):
|
||||||
|
|
||||||
return combined
|
return combined
|
||||||
|
|
||||||
import threading
|
|
||||||
pdf_cache_lock = threading.Lock()
|
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=3)
|
@lru_cache(maxsize=3)
|
||||||
def _get_pdf_pages_cached(filename):
|
def get_pdf_pages(filename):
|
||||||
|
"""Caches the heavy PDF rendering step for the current and next files."""
|
||||||
pdf_path = os.path.join(INPUT_DIR, filename)
|
pdf_path = os.path.join(INPUT_DIR, filename)
|
||||||
return convert_from_path(pdf_path)
|
return convert_from_path(pdf_path)
|
||||||
|
|
||||||
def get_pdf_pages(filename):
|
|
||||||
"""Thread-safe wrapper for the cached PDF conversion."""
|
|
||||||
with pdf_cache_lock:
|
|
||||||
return _get_pdf_pages_cached(filename)
|
|
||||||
|
|
||||||
def process_single_pdf(filename, shift_offset=0, max_per_file=5):
|
def process_single_pdf(filename, shift_offset=0, max_per_file=5):
|
||||||
"""
|
"""
|
||||||
Converts PDF to stitched images.
|
Converts PDF to stitched images.
|
||||||
|
|
@ -146,8 +137,7 @@ def process_single_pdf(filename, shift_offset=0, max_per_file=5):
|
||||||
|
|
||||||
# 3. Generate Preview (All stitched together, Resized)
|
# 3. Generate Preview (All stitched together, Resized)
|
||||||
full_stitch = stitch_images(cropped_images)
|
full_stitch = stitch_images(cropped_images)
|
||||||
# preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.LANCZOS)
|
preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.LANCZOS)
|
||||||
preview_resized = full_stitch.resize(OUTPUT_SIZE, Image.BILINEAR)
|
|
||||||
|
|
||||||
schema = {
|
schema = {
|
||||||
"original_filename": filename,
|
"original_filename": filename,
|
||||||
|
|
@ -210,6 +200,8 @@ class ImageReviewer:
|
||||||
self.current_preview = None # Only stores the resized preview for GUI
|
self.current_preview = None # Only stores the resized preview for GUI
|
||||||
self.is_processing = False
|
self.is_processing = False
|
||||||
|
|
||||||
|
# Queue for pre-fetched results (index, (preview, splits, schema))
|
||||||
|
self.prefetch_queue = Queue(maxsize=1)
|
||||||
# Queue for manual re-processing results
|
# Queue for manual re-processing results
|
||||||
self.manual_queue = Queue()
|
self.manual_queue = Queue()
|
||||||
|
|
||||||
|
|
@ -252,15 +244,19 @@ class ImageReviewer:
|
||||||
self.trigger_processing(self.files[self.index], self.current_shift)
|
self.trigger_processing(self.files[self.index], self.current_shift)
|
||||||
|
|
||||||
def prefetch_worker(self):
|
def prefetch_worker(self):
|
||||||
"""Background thread to load the NEXT file's PDF pages into RAM."""
|
"""Background thread to process the NEXT image constantly."""
|
||||||
idx_to_process = -1
|
idx_to_process = 0
|
||||||
while True:
|
while True:
|
||||||
target = self.index + 1
|
target = self.index + 1
|
||||||
if target < len(self.files) and target != idx_to_process:
|
if target < len(self.files):
|
||||||
|
if idx_to_process != target:
|
||||||
fname = self.files[target]
|
fname = self.files[target]
|
||||||
get_pdf_pages(fname) # Just calling it warms the lru_cache
|
result = process_single_pdf(fname, shift_offset=0)
|
||||||
|
if result:
|
||||||
|
self.prefetch_queue.put((target, result)) # Blocks if full
|
||||||
idx_to_process = target
|
idx_to_process = target
|
||||||
time.sleep(0.05)
|
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
def load_current_image(self, use_prefetch=False):
|
def load_current_image(self, use_prefetch=False):
|
||||||
if self.index >= len(self.files):
|
if self.index >= len(self.files):
|
||||||
|
|
@ -270,9 +266,20 @@ class ImageReviewer:
|
||||||
|
|
||||||
filename = self.files[self.index]
|
filename = self.files[self.index]
|
||||||
self.is_processing = False
|
self.is_processing = False
|
||||||
self.current_shift = 0
|
|
||||||
|
|
||||||
# Always trigger processing. If prefetched, get_pdf_pages returns instantly.
|
result_found = None
|
||||||
|
|
||||||
|
if use_prefetch and not self.prefetch_queue.empty():
|
||||||
|
q_idx, q_result = self.prefetch_queue.queue[0]
|
||||||
|
if q_idx == self.index:
|
||||||
|
_, result_found = self.prefetch_queue.get()
|
||||||
|
self.current_shift = 0
|
||||||
|
print(f"Loaded {filename} from prefetch.")
|
||||||
|
|
||||||
|
if result_found:
|
||||||
|
self.handle_processing_result(result_found, filename)
|
||||||
|
else:
|
||||||
|
# Not in queue (first load or queue mismatch), process manually
|
||||||
self.trigger_processing(filename, self.current_shift)
|
self.trigger_processing(filename, self.current_shift)
|
||||||
|
|
||||||
def trigger_processing(self, filename, shift):
|
def trigger_processing(self, filename, shift):
|
||||||
|
|
|
||||||
|
|
@ -314,7 +314,7 @@ def process_copy_group(group_key, files):
|
||||||
|
|
||||||
# Run ThreadPool on GROUPS (Copies), not individual files
|
# Run ThreadPool on GROUPS (Copies), not individual files
|
||||||
# Each thread handles one student's full exam copy sequentially
|
# Each thread handles one student's full exam copy sequentially
|
||||||
with ThreadPoolExecutor(max_workers=12) as executor:
|
with ThreadPoolExecutor(max_workers=16) as executor:
|
||||||
# Convert dict items to arguments for map
|
# Convert dict items to arguments for map
|
||||||
# executor.map expects a function and an iterable.
|
# executor.map expects a function and an iterable.
|
||||||
# We use a lambda or separate function to unpack the tuple if needed,
|
# We use a lambda or separate function to unpack the tuple if needed,
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,6 @@ import os
|
||||||
import re
|
import re
|
||||||
import glob
|
import glob
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
|
||||||
from pypdf import PdfReader, PdfWriter
|
from pypdf import PdfReader, PdfWriter
|
||||||
|
|
||||||
# --- Constants ---
|
# --- Constants ---
|
||||||
|
|
@ -95,7 +94,7 @@ class PDFPreviewer:
|
||||||
"← / → : Move line 1cm left/right\n"
|
"← / → : Move line 1cm left/right\n"
|
||||||
"'c': Rotate page 180°, 'C' : rotate all pages, ',' : rotate all files\n"
|
"'c': Rotate page 180°, 'C' : rotate all pages, ',' : rotate all files\n"
|
||||||
"t s r n m: keep left, next page, keep none, keep right, keep as is\n"
|
"t s r n m: keep left, next page, keep none, keep right, keep as is\n"
|
||||||
"z: send this page to the end, 'A':pdf arranger 'R':restart file, 'P':back to previous file\n"
|
"z: send this page to the end, 'R':restart file, 'P':back to previous file\n"
|
||||||
)
|
)
|
||||||
self.info_label = tk.Label(master, text=instructions, justify=tk.LEFT)
|
self.info_label = tk.Label(master, text=instructions, justify=tk.LEFT)
|
||||||
self.info_label.pack(pady=5, side=tk.TOP)
|
self.info_label.pack(pady=5, side=tk.TOP)
|
||||||
|
|
@ -124,7 +123,6 @@ class PDFPreviewer:
|
||||||
self.master.bind("r", self.discard_page)
|
self.master.bind("r", self.discard_page)
|
||||||
self.master.bind("z", self.send_page_end)
|
self.master.bind("z", self.send_page_end)
|
||||||
self.master.bind("R", self.restart_current_file)
|
self.master.bind("R", self.restart_current_file)
|
||||||
self.master.bind("A", self.start_arranger)
|
|
||||||
self.master.bind("P", self.go_to_previous_file)
|
self.master.bind("P", self.go_to_previous_file)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -133,9 +131,6 @@ class PDFPreviewer:
|
||||||
|
|
||||||
self.current_zoom = 1.0
|
self.current_zoom = 1.0
|
||||||
|
|
||||||
def start_arranger(self):
|
|
||||||
subprocess.Popen(["pdf-arranger", self.pdf_path])
|
|
||||||
|
|
||||||
def on_resize(self, event):
|
def on_resize(self, event):
|
||||||
"""
|
"""
|
||||||
Handles window resize events by reloading the page.
|
Handles window resize events by reloading the page.
|
||||||
|
|
|
||||||
13
plotting.py
13
plotting.py
|
|
@ -309,18 +309,7 @@ class ImageViewer:
|
||||||
|
|
||||||
def on_open_interro(self, event):
|
def on_open_interro(self, event):
|
||||||
if self.is_viewing and self.current_json_path:
|
if self.is_viewing and self.current_json_path:
|
||||||
# Check local directory first
|
pdf_path = "/home/sebastien/Prépa/Staging/Interro/" + str(base_dir) + ".pdf"
|
||||||
local_accent = self.base_dir / "énoncé.pdf"
|
|
||||||
local_plain = self.base_dir / "enonce.pdf"
|
|
||||||
|
|
||||||
if local_accent.exists():
|
|
||||||
pdf_path = str(local_accent)
|
|
||||||
elif local_plain.exists():
|
|
||||||
pdf_path = str(local_plain)
|
|
||||||
else:
|
|
||||||
# Fallback to the Interro staging directory
|
|
||||||
pdf_path = f"/home/sebastien/Prépa/Staging/Interro/{self.base_dir.name}.pdf"
|
|
||||||
|
|
||||||
print(f"Opening {pdf_path}")
|
print(f"Opening {pdf_path}")
|
||||||
subprocess.Popen(['xdg-open', pdf_path])
|
subprocess.Popen(['xdg-open', pdf_path])
|
||||||
|
|
||||||
|
|
|
||||||
104
splitting_int.py
104
splitting_int.py
|
|
@ -27,7 +27,7 @@ def decode_json(pdf_file):
|
||||||
(b, label) = d["box_2d"], d["label"]
|
(b, label) = d["box_2d"], d["label"]
|
||||||
pn = page_number(b)
|
pn = page_number(b)
|
||||||
carreau = 1000 // 38
|
carreau = 1000 // 38
|
||||||
result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau), b[1], b[3]))
|
result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau)))
|
||||||
result.sort(key=lambda x: (x[1], x[2]))
|
result.sort(key=lambda x: (x[1], x[2]))
|
||||||
return (name, result)
|
return (name, result)
|
||||||
|
|
||||||
|
|
@ -39,23 +39,11 @@ def split_an_interro(base_dir, input_pdf, coords_list):
|
||||||
generated_files = set()
|
generated_files = set()
|
||||||
parts_by_label = defaultdict(list)
|
parts_by_label = defaultdict(list)
|
||||||
|
|
||||||
# 1. Parse labels to strip '|' and determine type: L (Left), R (Right), N (Normal)
|
# Filter consecutive duplicate labels
|
||||||
parsed_coords = []
|
|
||||||
for item in coords_list:
|
|
||||||
label, pn, y0, y1, x0, x1 = item
|
|
||||||
if label.startswith("|"):
|
|
||||||
c_type, clean_label = "L", label[1:]
|
|
||||||
elif label.endswith("|"):
|
|
||||||
c_type, clean_label = "R", label[:-1]
|
|
||||||
else:
|
|
||||||
c_type, clean_label = "N", label
|
|
||||||
parsed_coords.append((clean_label, c_type, pn, y0, y1, x0, x1))
|
|
||||||
|
|
||||||
# 2. Filter consecutive duplicate labels based on the cleaned name
|
|
||||||
filtered_coords = []
|
filtered_coords = []
|
||||||
if parsed_coords:
|
if coords_list:
|
||||||
filtered_coords.append(parsed_coords[0])
|
filtered_coords.append(coords_list[0])
|
||||||
for item in parsed_coords[1:]:
|
for item in coords_list[1:]:
|
||||||
if item[0] != filtered_coords[-1][0]:
|
if item[0] != filtered_coords[-1][0]:
|
||||||
filtered_coords.append(item)
|
filtered_coords.append(item)
|
||||||
coords_list = filtered_coords
|
coords_list = filtered_coords
|
||||||
|
|
@ -65,11 +53,11 @@ def split_an_interro(base_dir, input_pdf, coords_list):
|
||||||
page_height = page.rect.height
|
page_height = page.rect.height
|
||||||
return (y / 1000) * page_height
|
return (y / 1000) * page_height
|
||||||
|
|
||||||
def save_cropped_page(doc, page_num, x0, y0, x1, y1, out_path):
|
def save_cropped_page(doc, page_num, y0, y1, out_path):
|
||||||
"""Saves a cropped portion of a page as a new PDF."""
|
"""Saves a cropped portion of a page as a new PDF."""
|
||||||
page = doc[page_num]
|
page = doc[page_num]
|
||||||
rotated_rect = page.rect * page.transformation_matrix
|
rotated_rect = page.rect * page.transformation_matrix
|
||||||
visual_crop_rect = fitz.Rect(rotated_rect.x0 + x0, y0, rotated_rect.x0 + x1, y1)
|
visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
|
||||||
unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
|
unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
|
||||||
|
|
||||||
temp_doc = fitz.open()
|
temp_doc = fitz.open()
|
||||||
|
|
@ -88,70 +76,46 @@ def split_an_interro(base_dir, input_pdf, coords_list):
|
||||||
temp_doc.close()
|
temp_doc.close()
|
||||||
|
|
||||||
# Iterate through all labels
|
# Iterate through all labels
|
||||||
for idx, (clean_label, c_type, start_page, y_start_raw, y_end_box, x0_raw, x1_raw) in enumerate(coords_list):
|
for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list):
|
||||||
if clean_label == "_":
|
|
||||||
continue
|
|
||||||
|
|
||||||
temp_parts = []
|
temp_parts = []
|
||||||
|
|
||||||
|
# Determine the stopping point for this label
|
||||||
|
if idx + 1 < len(coords_list):
|
||||||
|
# Normal case: stop at the next label
|
||||||
|
_, end_page, _, y_end_raw = coords_list[idx + 1]
|
||||||
|
end_y_target_raw = y_end_raw
|
||||||
|
else:
|
||||||
|
# FIX BUG 1: Last label extends to the very end of the document
|
||||||
end_page = doc.page_count - 1
|
end_page = doc.page_count - 1
|
||||||
end_y_target_raw = 1000
|
end_y_target_raw = 1000 # 1000 represents full height
|
||||||
|
|
||||||
# RULE 2: Determine stopping label
|
|
||||||
for next_item in coords_list[idx + 1:]:
|
|
||||||
n_clean, n_type, n_pn, n_y_start, _, _, _ = next_item
|
|
||||||
|
|
||||||
if c_type == "L":
|
|
||||||
is_stop = (n_type in ("L", "N"))
|
|
||||||
elif c_type == "R":
|
|
||||||
is_stop = (n_type in ("R", "N"))
|
|
||||||
else:
|
|
||||||
is_stop = True # Normal labels stop at anything
|
|
||||||
|
|
||||||
if is_stop:
|
|
||||||
end_page = n_pn
|
|
||||||
end_y_target_raw = n_y_start
|
|
||||||
break
|
|
||||||
|
|
||||||
# RULES 3 & 4: Calculate horizontal boundaries (0.0 to 1.0 fraction of local page width)
|
|
||||||
col_w = 1000 / doc.page_count
|
|
||||||
if c_type == "L": # |name
|
|
||||||
fraction_x0 = (x0_raw % col_w) / col_w
|
|
||||||
fraction_x1 = 1.0
|
|
||||||
end_y_target_raw = min(1000, end_y_target_raw + 40)
|
|
||||||
elif c_type == "R": # name|
|
|
||||||
fraction_x0 = 0.0
|
|
||||||
# Find the closest 'L' label in y-distance
|
|
||||||
L_labels = [it for it in parsed_coords if it[1] == "L"]
|
|
||||||
if L_labels:
|
|
||||||
closest_L = min(L_labels, key=lambda it: abs(it[3] - y_start_raw))
|
|
||||||
closest_L_x_center = (closest_L[5] + closest_L[6]) / 2.0
|
|
||||||
fraction_x1 = (closest_L_x_center % col_w) / col_w
|
|
||||||
if fraction_x1 <= fraction_x0: fraction_x1 = 1.0 # Fallback
|
|
||||||
else:
|
|
||||||
fraction_x1 = 1.0
|
|
||||||
else: # Normal
|
|
||||||
fraction_x0 = 0.0
|
|
||||||
fraction_x1 = 1.0
|
|
||||||
|
|
||||||
|
# FIX BUG 2: Iterate through EVERY page from start to end
|
||||||
|
# This handles cases where start_page == end_page, start_page + 1 == end_page,
|
||||||
|
# AND start_page + N == end_page (gaps)
|
||||||
current_p = start_page
|
current_p = start_page
|
||||||
while current_p <= end_page:
|
while current_p <= end_page:
|
||||||
page = doc[current_p]
|
|
||||||
|
|
||||||
y0 = scale_coord(y_start_raw, page) if current_p == start_page else 0
|
# Determine Top Cut (y0)
|
||||||
y1 = scale_coord(end_y_target_raw, page) if current_p == end_page else page.rect.height
|
if current_p == start_page:
|
||||||
|
y0 = scale_coord(y_start_raw, doc[current_p])
|
||||||
|
else:
|
||||||
|
y0 = 0 # Start from top of page for intermediate/last pages
|
||||||
|
|
||||||
|
# Determine Bottom Cut (y1)
|
||||||
|
if current_p == end_page:
|
||||||
|
y1 = scale_coord(end_y_target_raw, doc[current_p])
|
||||||
|
else:
|
||||||
|
y1 = doc[current_p].rect.height # Go to bottom of intermediate pages
|
||||||
|
|
||||||
|
# Only save if the slice has height (avoid empty files)
|
||||||
if y1 > y0 + 1:
|
if y1 > y0 + 1:
|
||||||
# Convert fractions to absolute PDF points
|
|
||||||
x0_pdf = fraction_x0 * page.rect.width
|
|
||||||
x1_pdf = fraction_x1 * page.rect.width
|
|
||||||
|
|
||||||
temp_path = f"_part_{idx}_{current_p}.pdf"
|
temp_path = f"_part_{idx}_{current_p}.pdf"
|
||||||
save_cropped_page(doc, current_p, x0_pdf, y0, x1_pdf, y1, temp_path)
|
save_cropped_page(doc, current_p, y0, y1, temp_path)
|
||||||
temp_parts.append(temp_path)
|
temp_parts.append(temp_path)
|
||||||
|
|
||||||
current_p += 1
|
current_p += 1
|
||||||
|
|
||||||
parts_by_label[clean_label].extend(temp_parts)
|
parts_by_label[title].extend(temp_parts)
|
||||||
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
|
||||||
5
utils.py
5
utils.py
|
|
@ -14,7 +14,10 @@ def enonce_total(base_dir):
|
||||||
if not text_dir.is_dir():
|
if not text_dir.is_dir():
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
files = [f for f in text_dir.iterdir() if f.is_file()]
|
# Exclude .tex and .pdf files
|
||||||
|
files = [f for f in text_dir.iterdir()
|
||||||
|
if f.is_file() and f.suffix.lower() not in ('.tex', '.pdf')]
|
||||||
|
|
||||||
files.sort(key=lambda f: natural_key(f.name))
|
files.sort(key=lambda f: natural_key(f.name))
|
||||||
|
|
||||||
output = []
|
output = []
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue