Initial support for "_", "|…" and "…|" in label names

master
Sébastien Miquel 2026-05-08 22:20:54 +02:00
parent 26d02b0987
commit 8d9165d0ac
4 changed files with 84 additions and 41 deletions

View File

@ -1,7 +1,7 @@
#+title: Script #+title: Script
#+author: Sébastien Miquel #+author: Sébastien Miquel
#+date: 14-03-2026 #+date: 14-03-2026
# Time-stamp: <07-05-26 11:33> # Time-stamp: <08-05-26 22:18>
#+OPTIONS: #+OPTIONS:
* Quézaco * Quézaco
@ -106,7 +106,13 @@ export GEMINI_API_KEY=…
labels des exercices/questions. labels des exercices/questions.
Rerun on a single file with =python cutleft.py Interro/Copie01.pdf= Rerun on a single file with =python cutleft.py Interro/Copie01.pdf=
5. =python enonce_info.py Interro= (gestion perso)
* Génération d'information sur l'énoncé
1. =python enonce_info.py Interro= (gestion perso)
OU
2. =python gemini_for_enonce.py Interro=
+ Nécessite =enonce.tex/org= et =correction.tex/org=
* Labelisation et regroupement * Labelisation et regroupement
@ -124,7 +130,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
+ Quand un label est manquant, il est possible de cliquer sur + Quand un label est manquant, il est possible de cliquer sur
l'image, ce qui copie les coordonnées dans le presse papier l'image, ce qui copie les coordonnées dans le presse papier
(sous linux…), puis on peut l'ajouter à la main. (sous linux…), puis on peut l'ajouter à la main.
+ Utilisation de =_=, =|…= et =…|=
Pour modifier une seule copie : Pour modifier une seule copie :
=python plotting.py Interro/Copie01.pdf= =python plotting.py Interro/Copie01.pdf=

View File

@ -157,7 +157,7 @@ def generate_request(file, labels, names, context_labels):
] ]
generate_content_config = types.GenerateContentConfig( generate_content_config = types.GenerateContentConfig(
temperature=1.0, temperature=1.,
top_p=0.95, top_p=0.95,
seed=0, seed=0,
max_output_tokens=65535, max_output_tokens=65535,

View File

@ -27,7 +27,7 @@ def decode_json(pdf_file):
(b, label) = d["box_2d"], d["label"] (b, label) = d["box_2d"], d["label"]
pn = page_number(b) pn = page_number(b)
carreau = 1000 // 38 carreau = 1000 // 38
result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau))) result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau), b[1], b[3]))
result.sort(key=lambda x: (x[1], x[2])) result.sort(key=lambda x: (x[1], x[2]))
return (name, result) return (name, result)
@ -39,11 +39,23 @@ def split_an_interro(base_dir, input_pdf, coords_list):
generated_files = set() generated_files = set()
parts_by_label = defaultdict(list) parts_by_label = defaultdict(list)
# Filter consecutive duplicate labels # 1. Parse labels to strip '|' and determine type: L (Left), R (Right), N (Normal)
parsed_coords = []
for item in coords_list:
label, pn, y0, y1, x0, x1 = item
if label.startswith("|"):
c_type, clean_label = "L", label[1:]
elif label.endswith("|"):
c_type, clean_label = "R", label[:-1]
else:
c_type, clean_label = "N", label
parsed_coords.append((clean_label, c_type, pn, y0, y1, x0, x1))
# 2. Filter consecutive duplicate labels based on the cleaned name
filtered_coords = [] filtered_coords = []
if coords_list: if parsed_coords:
filtered_coords.append(coords_list[0]) filtered_coords.append(parsed_coords[0])
for item in coords_list[1:]: for item in parsed_coords[1:]:
if item[0] != filtered_coords[-1][0]: if item[0] != filtered_coords[-1][0]:
filtered_coords.append(item) filtered_coords.append(item)
coords_list = filtered_coords coords_list = filtered_coords
@ -53,11 +65,11 @@ def split_an_interro(base_dir, input_pdf, coords_list):
page_height = page.rect.height page_height = page.rect.height
return (y / 1000) * page_height return (y / 1000) * page_height
def save_cropped_page(doc, page_num, y0, y1, out_path): def save_cropped_page(doc, page_num, x0, y0, x1, y1, out_path):
"""Saves a cropped portion of a page as a new PDF.""" """Saves a cropped portion of a page as a new PDF."""
page = doc[page_num] page = doc[page_num]
rotated_rect = page.rect * page.transformation_matrix rotated_rect = page.rect * page.transformation_matrix
visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1) visual_crop_rect = fitz.Rect(rotated_rect.x0 + x0, y0, rotated_rect.x0 + x1, y1)
unrotated_clip_rect = visual_crop_rect * page.derotation_matrix unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
temp_doc = fitz.open() temp_doc = fitz.open()
@ -76,46 +88,70 @@ def split_an_interro(base_dir, input_pdf, coords_list):
temp_doc.close() temp_doc.close()
# Iterate through all labels # Iterate through all labels
for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list): for idx, (clean_label, c_type, start_page, y_start_raw, y_end_box, x0_raw, x1_raw) in enumerate(coords_list):
if clean_label == "_":
continue
temp_parts = [] temp_parts = []
end_page = doc.page_count - 1
end_y_target_raw = 1000
# Determine the stopping point for this label # RULE 2: Determine stopping label
if idx + 1 < len(coords_list): for next_item in coords_list[idx + 1:]:
# Normal case: stop at the next label n_clean, n_type, n_pn, n_y_start, _, _, _ = next_item
_, end_page, _, y_end_raw = coords_list[idx + 1]
end_y_target_raw = y_end_raw if c_type == "L":
else: is_stop = (n_type in ("L", "N"))
# FIX BUG 1: Last label extends to the very end of the document elif c_type == "R":
end_page = doc.page_count - 1 is_stop = (n_type in ("R", "N"))
end_y_target_raw = 1000 # 1000 represents full height else:
is_stop = True # Normal labels stop at anything
if is_stop:
end_page = n_pn
end_y_target_raw = n_y_start
break
# RULES 3 & 4: Calculate horizontal boundaries (0.0 to 1.0 fraction of local page width)
col_w = 1000 / doc.page_count
if c_type == "L": # |name
fraction_x0 = (x0_raw % col_w) / col_w
fraction_x1 = 1.0
end_y_target_raw = min(1000, end_y_target_raw + 40)
elif c_type == "R": # name|
fraction_x0 = 0.0
# Find the closest 'L' label in y-distance
L_labels = [it for it in parsed_coords if it[1] == "L"]
if L_labels:
closest_L = min(L_labels, key=lambda it: abs(it[3] - y_start_raw))
closest_L_x_center = (closest_L[5] + closest_L[6]) / 2.0
fraction_x1 = (closest_L_x_center % col_w) / col_w
if fraction_x1 <= fraction_x0: fraction_x1 = 1.0 # Fallback
else:
fraction_x1 = 1.0
else: # Normal
fraction_x0 = 0.0
fraction_x1 = 1.0
# FIX BUG 2: Iterate through EVERY page from start to end
# This handles cases where start_page == end_page, start_page + 1 == end_page,
# AND start_page + N == end_page (gaps)
current_p = start_page current_p = start_page
while current_p <= end_page: while current_p <= end_page:
page = doc[current_p]
# Determine Top Cut (y0) y0 = scale_coord(y_start_raw, page) if current_p == start_page else 0
if current_p == start_page: y1 = scale_coord(end_y_target_raw, page) if current_p == end_page else page.rect.height
y0 = scale_coord(y_start_raw, doc[current_p])
else:
y0 = 0 # Start from top of page for intermediate/last pages
# Determine Bottom Cut (y1)
if current_p == end_page:
y1 = scale_coord(end_y_target_raw, doc[current_p])
else:
y1 = doc[current_p].rect.height # Go to bottom of intermediate pages
# Only save if the slice has height (avoid empty files)
if y1 > y0 + 1: if y1 > y0 + 1:
# Convert fractions to absolute PDF points
x0_pdf = fraction_x0 * page.rect.width
x1_pdf = fraction_x1 * page.rect.width
temp_path = f"_part_{idx}_{current_p}.pdf" temp_path = f"_part_{idx}_{current_p}.pdf"
save_cropped_page(doc, current_p, y0, y1, temp_path) save_cropped_page(doc, current_p, x0_pdf, y0, x1_pdf, y1, temp_path)
temp_parts.append(temp_path) temp_parts.append(temp_path)
current_p += 1 current_p += 1
parts_by_label[title].extend(temp_parts) parts_by_label[clean_label].extend(temp_parts)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)

View File

@ -5,9 +5,10 @@ def natural_key(text):
return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(text))] return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(text))]
def read_all_labels(base_dir): def read_all_labels(base_dir):
return sorted(list(filter(None, # return sorted(list(filter(None,
(Path(base_dir) / "labels").read_text().splitlines())), # (Path(base_dir) / "labels").read_text().splitlines())),
key = natural_key) # key = natural_key)
return list(filter(None, (Path(base_dir) / "labels").read_text().splitlines()))
def enonce_total(base_dir): def enonce_total(base_dir):
text_dir = Path(base_dir) / 'Text' text_dir = Path(base_dir) / 'Text'