master
Sébastien Miquel 2026-04-14 09:16:17 +02:00
parent 822a531679
commit 882c9b64ba
5 changed files with 21106 additions and 39 deletions

View File

@ -623,25 +623,6 @@ def process_single_task(task_tuple):
needs_correction.append(i)
break
#
# if ymin < yming-50 or ymax > ymaxg+50:
# print("Error : Gemini answered box2d too low/up", pid, label, group_name)
# if ymax < yming or ymin > ymaxg:
# print("Removing the box.")
# f["box_2d"] = None
# continue
# nymin = max(ymin, yming) * 1000 // total_height
# nymax = min(ymax, ymaxg) * 1000 // total_height
# f["box_2d"] = [nymin, xmin, nymax, xmax]
# if f["box_2d"] and xmax / 1000 > width_r:
# print("Error : Gemini answered box2d too right", pid, label, group_name)
# if xmin / 1000 > width_r:
# print("Removing the box.")
# f["box_2d"] = None
# continue
# f["box_2d"][3] = int(width_r * 1000)
if needs_correction:
tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...")
try:

View File

@ -211,7 +211,7 @@ for path_str in args.input_paths:
labels_txt = (INPUT_DIR / "labels").read_text()
valid_labels_set = set(line.strip() for line in labels_txt.splitlines() if line.strip())
names_path = (INPUT_DIR / "names")
if !os.path.exists(names_path):
if not os.path.exists(names_path):
names_path = Path("names")
names_txt = names_path.read_text()

21074
liste_francais.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -9,7 +9,7 @@ from tkinter import messagebox
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont, ImageTk
print("o to open pdf, O original pdf, e to emacs part, click for coordinates")
print("o to open pdf, O original pdf, e to emacs part, i to interro, click for coordinates")
# --- Configuration & Globals ---
padding = 60
@ -54,7 +54,7 @@ def convert_list(l, group_id, json_schema):
ll.append(ee)
return ll
def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages):
def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages, last_label_index):
im = Image.open(image_path)
im.load()
width, height = im.size
@ -62,7 +62,6 @@ def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages):
new_im.paste(im, (0, 0))
draw = ImageDraw.Draw(new_im)
bounding_boxes.sort(key=lambda b: (page_number(b["box_2d"], nb_pages), b["box_2d"][0]))
last_label_index = -1
for bbox in bounding_boxes:
raw_y_min = int(bbox["box_2d"][0] * height / 1000)
@ -88,7 +87,7 @@ def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages):
draw.text((abs_x_min + 8, abs_y_min - 30), label, fill=color, font=font)
else:
draw.text((abs_x_min + 8, abs_y_max + 6), label, fill=color, font=font)
return new_im
return (new_im, last_label_index)
# --- Processing Logic (Worker Thread) ---
@ -97,10 +96,15 @@ def worker_thread(base_dir, files_to_process, all_labels):
Iterates through files, prepares VISUALS only, and puts metadata in queue.
Does NOT write final JSON files anymore.
"""
previous_copie = None
last_label_index = None
for img_path in files_to_process:
json_path = base_dir / f"{img_path.stem}.json"
copie_part = int(img_path.stem[-2:])
copie = img_path.stem[:-3]
if copie != previous_copie:
last_label_index = -1
previous_copie = copie
json_schema_path = base_dir / 'Cutleft' / f"{copie}_schema.json"
try:
@ -127,7 +131,8 @@ def worker_thread(base_dir, files_to_process, all_labels):
try:
print(f"Buffering {img_path.name}...")
pil_image = prepare_image(str(img_path), bb_list, all_labels, nb_pages)
(pil_image, last_label_index) = \
prepare_image(str(img_path), bb_list, all_labels, nb_pages, last_label_index)
metadata = {
"copie": copie,
@ -169,6 +174,7 @@ class ImageViewer:
self.root.bind('<Return>', self.on_enter)
self.root.bind('e', self.on_edit)
self.root.bind('o', self.on_open_pdf)
self.root.bind('i', self.on_open_interro)
self.root.bind('O', self.on_open_ori_pdf)
self.root.bind('<Escape>', lambda e: self.root.quit())
self.label.bind('<Button-1>', self.on_click)
@ -265,6 +271,12 @@ class ImageViewer:
subprocess.Popen(['xdg-open', str(pdf_path.absolute())])
# Event handler: builds a PDF path and hands it to `xdg-open` in a child process.
# Does nothing unless both `self.is_viewing` and `self.current_json_path` are truthy.
# `event` is accepted (handler signature) but never read here.
def on_open_ori_pdf(self, event):
if self.is_viewing and self.current_json_path:
# `base_dir` is neither a parameter nor an attribute of `self`; it is resolved
# from an outer scope. The `+` concatenation needs it to be a str
# (a pathlib.Path here would raise TypeError) -- TODO confirm its type.
# NOTE(review): nothing is inserted between `base_dir` and "pdf", so `base_dir`
# must already end with whatever separator is expected -- confirm.
# NOTE(review): the method is named "ori_pdf" but the path points under
# .../Interro/ -- confirm the name and the target match.
pdf_path = "/home/sebastien/Staging/Interro/" + base_dir + "pdf"
print(f"Opening {pdf_path}")
# The child process is started and not waited on.
subprocess.Popen(['xdg-open', pdf_path])
def on_open_interro(self, event):
if self.is_viewing and self.current_json_path:
new_filename = self.current_json_path.stem.split('_')[0] + ".pdf"
pdf_path = self.current_json_path.parent / "Copies Originales" / new_filename

View File

@ -20,28 +20,28 @@ import ftfy
import re
import urllib.request
# url = "https://raw.githubusercontent.com/hbenbel/French-Dictionary/master/dictionary/dictionary.txt"
# french_words = urllib.request.urlopen(url).read().decode('utf-8').splitlines()
# Load the French word list, one word per line.
# The words contain accented characters, so the file is decoded explicitly
# instead of relying on the platform's default encoding.
# NOTE(review): assumes liste_francais.txt is stored as UTF-8 -- confirm.
with open('liste_francais.txt', 'r', encoding='utf-8') as f:
    french_words = f.read().splitlines()

# 2. Pre-compute an O(1) lookup dictionary
# We simulate the corruption by replacing accents with null bytes (\x00)
lookup_map = {}
for word in french_words:
    # Replace all French accents with \x00 to create the "broken" key
    broken_key = re.sub(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]', '\x00', word)
    # Words without any accent can never be corrupted, so they are not stored.
    if '\x00' in broken_key:
        # When two words collapse to the same broken key, the one that comes
        # later in the list wins.
        lookup_map[broken_key] = word  # e.g., "\x00cole" -> "école"
# 3. Fast replace function
def fast_fix(text):
    """Restore accented words whose accents were replaced by null bytes.

    Every maximal run of ASCII letters and null-byte characters in *text* is
    lower-cased and looked up in the module-level ``lookup_map``. A hit is
    replaced by the mapped word; a miss is left exactly as it was.

    Because the lookup key is lower-cased, a repaired word comes back with
    the casing stored in ``lookup_map``, not the casing of the input.

    Args:
        text: The string to repair.

    Returns:
        A new string with every recognised broken word replaced.
    """
    def replacer(match):
        broken_word = match.group(0)
        # Unknown runs fall through unchanged so ordinary words survive.
        return lookup_map.get(broken_word.lower(), broken_word)

    return re.sub(r'[a-zA-Z\x00]+', replacer, text)