Initial support for "_", "|_" and "…|" in label names
parent
26d02b0987
commit
8d9165d0ac
12
Readme.org
12
Readme.org
|
|
@ -1,7 +1,7 @@
|
||||||
#+title: Script
|
#+title: Script
|
||||||
#+author: Sébastien Miquel
|
#+author: Sébastien Miquel
|
||||||
#+date: 14-03-2026
|
#+date: 14-03-2026
|
||||||
# Time-stamp: <07-05-26 11:33>
|
# Time-stamp: <08-05-26 22:18>
|
||||||
#+OPTIONS:
|
#+OPTIONS:
|
||||||
|
|
||||||
* Quézaco
|
* Quézaco
|
||||||
|
|
@ -106,7 +106,13 @@ export GEMINI_API_KEY=…
|
||||||
labels des exercices/questions.
|
labels des exercices/questions.
|
||||||
|
|
||||||
Rerun on a single file with =python cutleft.py Interro/Copie01.pdf=
|
Rerun on a single file with =python cutleft.py Interro/Copie01.pdf=
|
||||||
5. =python enonce_info.py Interro= (gestion perso)
|
|
||||||
|
* Génération d'information sur l'énoncé
|
||||||
|
|
||||||
|
1. =python enonce_info.py Interro= (gestion perso)
|
||||||
|
OU
|
||||||
|
2. =python gemini_for_enonce.py Interro=
|
||||||
|
+ Nécessite =enonce.tex/org= et =correction.tex/org=
|
||||||
|
|
||||||
* Labelisation et regroupement
|
* Labelisation et regroupement
|
||||||
|
|
||||||
|
|
@ -124,7 +130,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
|
||||||
+ Quand un label est manquant, il est possible de cliquer sur
|
+ Quand un label est manquant, il est possible de cliquer sur
|
||||||
l'image, ce qui copie les coordonnées dans le presse papier
|
l'image, ce qui copie les coordonnées dans le presse papier
|
||||||
(sous linux…), puis on peut l'ajouter à la main.
|
(sous linux…), puis on peut l'ajouter à la main.
|
||||||
|
+ Utilisation de =_=, =|…= et =…|=
|
||||||
Pour modifier une seule copie :
|
Pour modifier une seule copie :
|
||||||
=python plotting.py Interro/Copie01.pdf=
|
=python plotting.py Interro/Copie01.pdf=
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -157,7 +157,7 @@ def generate_request(file, labels, names, context_labels):
|
||||||
]
|
]
|
||||||
|
|
||||||
generate_content_config = types.GenerateContentConfig(
|
generate_content_config = types.GenerateContentConfig(
|
||||||
temperature=1.0,
|
temperature=1.,
|
||||||
top_p=0.95,
|
top_p=0.95,
|
||||||
seed=0,
|
seed=0,
|
||||||
max_output_tokens=65535,
|
max_output_tokens=65535,
|
||||||
|
|
|
||||||
104
splitting_int.py
104
splitting_int.py
|
|
@ -27,7 +27,7 @@ def decode_json(pdf_file):
|
||||||
(b, label) = d["box_2d"], d["label"]
|
(b, label) = d["box_2d"], d["label"]
|
||||||
pn = page_number(b)
|
pn = page_number(b)
|
||||||
carreau = 1000 // 38
|
carreau = 1000 // 38
|
||||||
result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau)))
|
result.append((label, pn, b[0] - int(carreau), b[2]-int(carreau), b[1], b[3]))
|
||||||
result.sort(key=lambda x: (x[1], x[2]))
|
result.sort(key=lambda x: (x[1], x[2]))
|
||||||
return (name, result)
|
return (name, result)
|
||||||
|
|
||||||
|
|
@ -39,11 +39,23 @@ def split_an_interro(base_dir, input_pdf, coords_list):
|
||||||
generated_files = set()
|
generated_files = set()
|
||||||
parts_by_label = defaultdict(list)
|
parts_by_label = defaultdict(list)
|
||||||
|
|
||||||
# Filter consecutive duplicate labels
|
# 1. Parse labels to strip '|' and determine type: L (Left), R (Right), N (Normal)
|
||||||
|
parsed_coords = []
|
||||||
|
for item in coords_list:
|
||||||
|
label, pn, y0, y1, x0, x1 = item
|
||||||
|
if label.startswith("|"):
|
||||||
|
c_type, clean_label = "L", label[1:]
|
||||||
|
elif label.endswith("|"):
|
||||||
|
c_type, clean_label = "R", label[:-1]
|
||||||
|
else:
|
||||||
|
c_type, clean_label = "N", label
|
||||||
|
parsed_coords.append((clean_label, c_type, pn, y0, y1, x0, x1))
|
||||||
|
|
||||||
|
# 2. Filter consecutive duplicate labels based on the cleaned name
|
||||||
filtered_coords = []
|
filtered_coords = []
|
||||||
if coords_list:
|
if parsed_coords:
|
||||||
filtered_coords.append(coords_list[0])
|
filtered_coords.append(parsed_coords[0])
|
||||||
for item in coords_list[1:]:
|
for item in parsed_coords[1:]:
|
||||||
if item[0] != filtered_coords[-1][0]:
|
if item[0] != filtered_coords[-1][0]:
|
||||||
filtered_coords.append(item)
|
filtered_coords.append(item)
|
||||||
coords_list = filtered_coords
|
coords_list = filtered_coords
|
||||||
|
|
@ -53,11 +65,11 @@ def split_an_interro(base_dir, input_pdf, coords_list):
|
||||||
page_height = page.rect.height
|
page_height = page.rect.height
|
||||||
return (y / 1000) * page_height
|
return (y / 1000) * page_height
|
||||||
|
|
||||||
def save_cropped_page(doc, page_num, y0, y1, out_path):
|
def save_cropped_page(doc, page_num, x0, y0, x1, y1, out_path):
|
||||||
"""Saves a cropped portion of a page as a new PDF."""
|
"""Saves a cropped portion of a page as a new PDF."""
|
||||||
page = doc[page_num]
|
page = doc[page_num]
|
||||||
rotated_rect = page.rect * page.transformation_matrix
|
rotated_rect = page.rect * page.transformation_matrix
|
||||||
visual_crop_rect = fitz.Rect(rotated_rect.x0, y0, rotated_rect.x1, y1)
|
visual_crop_rect = fitz.Rect(rotated_rect.x0 + x0, y0, rotated_rect.x0 + x1, y1)
|
||||||
unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
|
unrotated_clip_rect = visual_crop_rect * page.derotation_matrix
|
||||||
|
|
||||||
temp_doc = fitz.open()
|
temp_doc = fitz.open()
|
||||||
|
|
@ -76,46 +88,70 @@ def split_an_interro(base_dir, input_pdf, coords_list):
|
||||||
temp_doc.close()
|
temp_doc.close()
|
||||||
|
|
||||||
# Iterate through all labels
|
# Iterate through all labels
|
||||||
for idx, (title, start_page, y_start_raw, _) in enumerate(coords_list):
|
for idx, (clean_label, c_type, start_page, y_start_raw, y_end_box, x0_raw, x1_raw) in enumerate(coords_list):
|
||||||
|
if clean_label == "_":
|
||||||
|
continue
|
||||||
|
|
||||||
temp_parts = []
|
temp_parts = []
|
||||||
|
|
||||||
# Determine the stopping point for this label
|
|
||||||
if idx + 1 < len(coords_list):
|
|
||||||
# Normal case: stop at the next label
|
|
||||||
_, end_page, _, y_end_raw = coords_list[idx + 1]
|
|
||||||
end_y_target_raw = y_end_raw
|
|
||||||
else:
|
|
||||||
# FIX BUG 1: Last label extends to the very end of the document
|
|
||||||
end_page = doc.page_count - 1
|
end_page = doc.page_count - 1
|
||||||
end_y_target_raw = 1000 # 1000 represents full height
|
end_y_target_raw = 1000
|
||||||
|
|
||||||
|
# RULE 2: Determine stopping label
|
||||||
|
for next_item in coords_list[idx + 1:]:
|
||||||
|
n_clean, n_type, n_pn, n_y_start, _, _, _ = next_item
|
||||||
|
|
||||||
|
if c_type == "L":
|
||||||
|
is_stop = (n_type in ("L", "N"))
|
||||||
|
elif c_type == "R":
|
||||||
|
is_stop = (n_type in ("R", "N"))
|
||||||
|
else:
|
||||||
|
is_stop = True # Normal labels stop at anything
|
||||||
|
|
||||||
|
if is_stop:
|
||||||
|
end_page = n_pn
|
||||||
|
end_y_target_raw = n_y_start
|
||||||
|
break
|
||||||
|
|
||||||
|
# RULES 3 & 4: Calculate horizontal boundaries (0.0 to 1.0 fraction of local page width)
|
||||||
|
col_w = 1000 / doc.page_count
|
||||||
|
if c_type == "L": # |name
|
||||||
|
fraction_x0 = (x0_raw % col_w) / col_w
|
||||||
|
fraction_x1 = 1.0
|
||||||
|
end_y_target_raw = min(1000, end_y_target_raw + 40)
|
||||||
|
elif c_type == "R": # name|
|
||||||
|
fraction_x0 = 0.0
|
||||||
|
# Find the closest 'L' label in y-distance
|
||||||
|
L_labels = [it for it in parsed_coords if it[1] == "L"]
|
||||||
|
if L_labels:
|
||||||
|
closest_L = min(L_labels, key=lambda it: abs(it[3] - y_start_raw))
|
||||||
|
closest_L_x_center = (closest_L[5] + closest_L[6]) / 2.0
|
||||||
|
fraction_x1 = (closest_L_x_center % col_w) / col_w
|
||||||
|
if fraction_x1 <= fraction_x0: fraction_x1 = 1.0 # Fallback
|
||||||
|
else:
|
||||||
|
fraction_x1 = 1.0
|
||||||
|
else: # Normal
|
||||||
|
fraction_x0 = 0.0
|
||||||
|
fraction_x1 = 1.0
|
||||||
|
|
||||||
# FIX BUG 2: Iterate through EVERY page from start to end
|
|
||||||
# This handles cases where start_page == end_page, start_page + 1 == end_page,
|
|
||||||
# AND start_page + N == end_page (gaps)
|
|
||||||
current_p = start_page
|
current_p = start_page
|
||||||
while current_p <= end_page:
|
while current_p <= end_page:
|
||||||
|
page = doc[current_p]
|
||||||
|
|
||||||
# Determine Top Cut (y0)
|
y0 = scale_coord(y_start_raw, page) if current_p == start_page else 0
|
||||||
if current_p == start_page:
|
y1 = scale_coord(end_y_target_raw, page) if current_p == end_page else page.rect.height
|
||||||
y0 = scale_coord(y_start_raw, doc[current_p])
|
|
||||||
else:
|
|
||||||
y0 = 0 # Start from top of page for intermediate/last pages
|
|
||||||
|
|
||||||
# Determine Bottom Cut (y1)
|
|
||||||
if current_p == end_page:
|
|
||||||
y1 = scale_coord(end_y_target_raw, doc[current_p])
|
|
||||||
else:
|
|
||||||
y1 = doc[current_p].rect.height # Go to bottom of intermediate pages
|
|
||||||
|
|
||||||
# Only save if the slice has height (avoid empty files)
|
|
||||||
if y1 > y0 + 1:
|
if y1 > y0 + 1:
|
||||||
|
# Convert fractions to absolute PDF points
|
||||||
|
x0_pdf = fraction_x0 * page.rect.width
|
||||||
|
x1_pdf = fraction_x1 * page.rect.width
|
||||||
|
|
||||||
temp_path = f"_part_{idx}_{current_p}.pdf"
|
temp_path = f"_part_{idx}_{current_p}.pdf"
|
||||||
save_cropped_page(doc, current_p, y0, y1, temp_path)
|
save_cropped_page(doc, current_p, x0_pdf, y0, x1_pdf, y1, temp_path)
|
||||||
temp_parts.append(temp_path)
|
temp_parts.append(temp_path)
|
||||||
|
|
||||||
current_p += 1
|
current_p += 1
|
||||||
|
|
||||||
parts_by_label[title].extend(temp_parts)
|
parts_by_label[clean_label].extend(temp_parts)
|
||||||
|
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
|
||||||
7
utils.py
7
utils.py
|
|
@ -5,9 +5,10 @@ def natural_key(text):
|
||||||
return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(text))]
|
return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(text))]
|
||||||
|
|
||||||
def read_all_labels(base_dir):
|
def read_all_labels(base_dir):
|
||||||
return sorted(list(filter(None,
|
# return sorted(list(filter(None,
|
||||||
(Path(base_dir) / "labels").read_text().splitlines())),
|
# (Path(base_dir) / "labels").read_text().splitlines())),
|
||||||
key = natural_key)
|
# key = natural_key)
|
||||||
|
return list(filter(None, (Path(base_dir) / "labels").read_text().splitlines()))
|
||||||
|
|
||||||
def enonce_total(base_dir):
|
def enonce_total(base_dir):
|
||||||
text_dir = Path(base_dir) / 'Text'
|
text_dir = Path(base_dir) / 'Text'
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue