master
Sébastien Miquel 2026-04-14 09:16:17 +02:00
parent 822a531679
commit 882c9b64ba
5 changed files with 21106 additions and 39 deletions

View File

@@ -623,25 +623,6 @@ def process_single_task(task_tuple):
needs_correction.append(i) needs_correction.append(i)
break break
#
# if ymin < yming-50 or ymax > ymaxg+50:
# print("Error : Gemini answered box2d too low/up", pid, label, group_name)
# if ymax < yming or ymin > ymaxg:
# print("Removing the box.")
# f["box_2d"] = None
# continue
# nymin = max(ymin, yming) * 1000 // total_height
# nymax = min(ymax, ymaxg) * 1000 // total_height
# f["box_2d"] = [nymin, xmin, nymax, xmax]
# if f["box_2d"] and xmax / 1000 > width_r:
# print("Error : Gemini answered box2d too right", pid, label, group_name)
# if xmin / 1000 > width_r:
# print("Removing the box.")
# f["box_2d"] = None
# continue
# f["box_2d"][3] = int(width_r * 1000)
if needs_correction: if needs_correction:
tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...") tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...")
try: try:

View File

@@ -211,7 +211,7 @@ for path_str in args.input_paths:
labels_txt = (INPUT_DIR / "labels").read_text() labels_txt = (INPUT_DIR / "labels").read_text()
valid_labels_set = set(line.strip() for line in labels_txt.splitlines() if line.strip()) valid_labels_set = set(line.strip() for line in labels_txt.splitlines() if line.strip())
names_path = (INPUT_DIR / "names") names_path = (INPUT_DIR / "names")
if !os.path.exists(names_path): if not os.path.exists(names_path):
names_path = Path("names") names_path = Path("names")
names_txt = names_path.read_text() names_txt = names_path.read_text()

21074
liste_francais.txt Normal file

File diff suppressed because it is too large. (Load Diff)

View File

@@ -9,7 +9,7 @@ from tkinter import messagebox
from pathlib import Path from pathlib import Path
from PIL import Image, ImageDraw, ImageFont, ImageTk from PIL import Image, ImageDraw, ImageFont, ImageTk
print("o to open pdf, O original pdf, e to emacs part, click for coordinates") print("o to open pdf, O original pdf, e to emacs part, i to interro, click for coordinates")
# --- Configuration & Globals --- # --- Configuration & Globals ---
padding = 60 padding = 60
@@ -54,7 +54,7 @@ def convert_list(l, group_id, json_schema):
ll.append(ee) ll.append(ee)
return ll return ll
def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages): def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages, last_label_index):
im = Image.open(image_path) im = Image.open(image_path)
im.load() im.load()
width, height = im.size width, height = im.size
@@ -62,7 +62,6 @@ def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages):
new_im.paste(im, (0, 0)) new_im.paste(im, (0, 0))
draw = ImageDraw.Draw(new_im) draw = ImageDraw.Draw(new_im)
bounding_boxes.sort(key=lambda b: (page_number(b["box_2d"], nb_pages), b["box_2d"][0])) bounding_boxes.sort(key=lambda b: (page_number(b["box_2d"], nb_pages), b["box_2d"][0]))
last_label_index = -1
for bbox in bounding_boxes: for bbox in bounding_boxes:
raw_y_min = int(bbox["box_2d"][0] * height / 1000) raw_y_min = int(bbox["box_2d"][0] * height / 1000)
@@ -88,7 +87,7 @@ def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages):
draw.text((abs_x_min + 8, abs_y_min - 30), label, fill=color, font=font) draw.text((abs_x_min + 8, abs_y_min - 30), label, fill=color, font=font)
else: else:
draw.text((abs_x_min + 8, abs_y_max + 6), label, fill=color, font=font) draw.text((abs_x_min + 8, abs_y_max + 6), label, fill=color, font=font)
return new_im return (new_im, last_label_index)
# --- Processing Logic (Worker Thread) --- # --- Processing Logic (Worker Thread) ---
@@ -97,10 +96,15 @@ def worker_thread(base_dir, files_to_process, all_labels):
Iterates through files, prepares VISUALS only, and puts metadata in queue. Iterates through files, prepares VISUALS only, and puts metadata in queue.
Does NOT write final JSON files anymore. Does NOT write final JSON files anymore.
""" """
previous_copie = None
last_label_index = None
for img_path in files_to_process: for img_path in files_to_process:
json_path = base_dir / f"{img_path.stem}.json" json_path = base_dir / f"{img_path.stem}.json"
copie_part = int(img_path.stem[-2:]) copie_part = int(img_path.stem[-2:])
copie = img_path.stem[:-3] copie = img_path.stem[:-3]
if copie != previous_copie:
last_label_index = -1
previous_copie = copie
json_schema_path = base_dir / 'Cutleft' / f"{copie}_schema.json" json_schema_path = base_dir / 'Cutleft' / f"{copie}_schema.json"
try: try:
@@ -127,7 +131,8 @@ def worker_thread(base_dir, files_to_process, all_labels):
try: try:
print(f"Buffering {img_path.name}...") print(f"Buffering {img_path.name}...")
pil_image = prepare_image(str(img_path), bb_list, all_labels, nb_pages) (pil_image, last_label_index) = \
prepare_image(str(img_path), bb_list, all_labels, nb_pages, last_label_index)
metadata = { metadata = {
"copie": copie, "copie": copie,
@@ -169,6 +174,7 @@ class ImageViewer:
self.root.bind('<Return>', self.on_enter) self.root.bind('<Return>', self.on_enter)
self.root.bind('e', self.on_edit) self.root.bind('e', self.on_edit)
self.root.bind('o', self.on_open_pdf) self.root.bind('o', self.on_open_pdf)
self.root.bind('i', self.on_open_interro)
self.root.bind('O', self.on_open_ori_pdf) self.root.bind('O', self.on_open_ori_pdf)
self.root.bind('<Escape>', lambda e: self.root.quit()) self.root.bind('<Escape>', lambda e: self.root.quit())
self.label.bind('<Button-1>', self.on_click) self.label.bind('<Button-1>', self.on_click)
@@ -265,6 +271,12 @@ class ImageViewer:
subprocess.Popen(['xdg-open', str(pdf_path.absolute())]) subprocess.Popen(['xdg-open', str(pdf_path.absolute())])
def on_open_ori_pdf(self, event): def on_open_ori_pdf(self, event):
if self.is_viewing and self.current_json_path:
pdf_path = "/home/sebastien/Staging/Interro/" + base_dir + "pdf"
print(f"Opening {pdf_path}")
subprocess.Popen(['xdg-open', pdf_path])
def on_open_interro(self, event):
if self.is_viewing and self.current_json_path: if self.is_viewing and self.current_json_path:
new_filename = self.current_json_path.stem.split('_')[0] + ".pdf" new_filename = self.current_json_path.stem.split('_')[0] + ".pdf"
pdf_path = self.current_json_path.parent / "Copies Originales" / new_filename pdf_path = self.current_json_path.parent / "Copies Originales" / new_filename

View File

@@ -20,28 +20,28 @@ import ftfy
import re import re
import urllib.request import urllib.request
# url = "https://raw.githubusercontent.com/hbenbel/French-Dictionary/master/dictionary/dictionary.txt" with open('liste_francais.txt', 'r') as f:
# french_words = urllib.request.urlopen(url).read().decode('utf-8').splitlines() french_words = f.read().splitlines()
# 2. Pre-compute an O(1) lookup dictionary # 2. Pre-compute an O(1) lookup dictionary
# We simulate the corruption by replacing accents with null bytes (\x00) # We simulate the corruption by replacing accents with null bytes (\x00)
# lookup_map = {} lookup_map = {}
# for word in french_words: for word in french_words:
# # Replace all French accents with \x00 to create the "broken" key # Replace all French accents with \x00 to create the "broken" key
# broken_key = re.sub(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]', '\x00', word) broken_key = re.sub(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]', '\x00', word)
# if '\x00' in broken_key: if '\x00' in broken_key:
# lookup_map[broken_key] = word # e.g., "\x00cole" -> "école" lookup_map[broken_key] = word # e.g., "\x00cole" -> "école"
# 3. Fast replace function # 3. Fast replace function
def fast_fix(text): def fast_fix(text):
# Find words containing regular letters and null bytes # Find words containing regular letters and null bytes
# def replacer(match): def replacer(match):
# broken_word = match.group(0) broken_word = match.group(0)
# # Return the fixed word from our map, or leave it if not found # Return the fixed word from our map, or leave it if not found
# # (Handles case-insensitivity by falling back to lowercase map) # (Handles case-insensitivity by falling back to lowercase map)
# return lookup_map.get(broken_word.lower(), broken_word) return lookup_map.get(broken_word.lower(), broken_word)
# return re.sub(r'[a-zA-Z\x00]+', replacer, text) return re.sub(r'[a-zA-Z\x00]+', replacer, text)
return text return text