Miscs
parent
822a531679
commit
882c9b64ba
|
|
@ -623,25 +623,6 @@ def process_single_task(task_tuple):
|
||||||
needs_correction.append(i)
|
needs_correction.append(i)
|
||||||
break
|
break
|
||||||
|
|
||||||
#
|
|
||||||
# if ymin < yming-50 or ymax > ymaxg+50:
|
|
||||||
# print("Error : Gemini answered box2d too low/up", pid, label, group_name)
|
|
||||||
# if ymax < yming or ymin > ymaxg:
|
|
||||||
# print("Removing the box.")
|
|
||||||
# f["box_2d"] = None
|
|
||||||
# continue
|
|
||||||
# nymin = max(ymin, yming) * 1000 // total_height
|
|
||||||
# nymax = min(ymax, ymaxg) * 1000 // total_height
|
|
||||||
# f["box_2d"] = [nymin, xmin, nymax, xmax]
|
|
||||||
|
|
||||||
# if f["box_2d"] and xmax / 1000 > width_r:
|
|
||||||
# print("Error : Gemini answered box2d too right", pid, label, group_name)
|
|
||||||
# if xmin / 1000 > width_r:
|
|
||||||
# print("Removing the box.")
|
|
||||||
# f["box_2d"] = None
|
|
||||||
# continue
|
|
||||||
# f["box_2d"][3] = int(width_r * 1000)
|
|
||||||
|
|
||||||
if needs_correction:
|
if needs_correction:
|
||||||
tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...")
|
tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...")
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -211,7 +211,7 @@ for path_str in args.input_paths:
|
||||||
labels_txt = (INPUT_DIR / "labels").read_text()
|
labels_txt = (INPUT_DIR / "labels").read_text()
|
||||||
valid_labels_set = set(line.strip() for line in labels_txt.splitlines() if line.strip())
|
valid_labels_set = set(line.strip() for line in labels_txt.splitlines() if line.strip())
|
||||||
names_path = (INPUT_DIR / "names")
|
names_path = (INPUT_DIR / "names")
|
||||||
if !os.path.exists(names_path):
|
if not os.path.exists(names_path):
|
||||||
names_path = Path("names")
|
names_path = Path("names")
|
||||||
names_txt = names_path.read_text()
|
names_txt = names_path.read_text()
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
22
plotting.py
22
plotting.py
|
|
@ -9,7 +9,7 @@ from tkinter import messagebox
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from PIL import Image, ImageDraw, ImageFont, ImageTk
|
from PIL import Image, ImageDraw, ImageFont, ImageTk
|
||||||
|
|
||||||
print("o to open pdf, O original pdf, e to emacs part, click for coordinates")
|
print("o to open pdf, O original pdf, e to emacs part, i to interro, click for coordinates")
|
||||||
|
|
||||||
# --- Configuration & Globals ---
|
# --- Configuration & Globals ---
|
||||||
padding = 60
|
padding = 60
|
||||||
|
|
@ -54,7 +54,7 @@ def convert_list(l, group_id, json_schema):
|
||||||
ll.append(ee)
|
ll.append(ee)
|
||||||
return ll
|
return ll
|
||||||
|
|
||||||
def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages):
|
def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages, last_label_index):
|
||||||
im = Image.open(image_path)
|
im = Image.open(image_path)
|
||||||
im.load()
|
im.load()
|
||||||
width, height = im.size
|
width, height = im.size
|
||||||
|
|
@ -62,7 +62,6 @@ def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages):
|
||||||
new_im.paste(im, (0, 0))
|
new_im.paste(im, (0, 0))
|
||||||
draw = ImageDraw.Draw(new_im)
|
draw = ImageDraw.Draw(new_im)
|
||||||
bounding_boxes.sort(key=lambda b: (page_number(b["box_2d"], nb_pages), b["box_2d"][0]))
|
bounding_boxes.sort(key=lambda b: (page_number(b["box_2d"], nb_pages), b["box_2d"][0]))
|
||||||
last_label_index = -1
|
|
||||||
|
|
||||||
for bbox in bounding_boxes:
|
for bbox in bounding_boxes:
|
||||||
raw_y_min = int(bbox["box_2d"][0] * height / 1000)
|
raw_y_min = int(bbox["box_2d"][0] * height / 1000)
|
||||||
|
|
@ -88,7 +87,7 @@ def prepare_image(image_path: str, bounding_boxes, all_labels, nb_pages):
|
||||||
draw.text((abs_x_min + 8, abs_y_min - 30), label, fill=color, font=font)
|
draw.text((abs_x_min + 8, abs_y_min - 30), label, fill=color, font=font)
|
||||||
else:
|
else:
|
||||||
draw.text((abs_x_min + 8, abs_y_max + 6), label, fill=color, font=font)
|
draw.text((abs_x_min + 8, abs_y_max + 6), label, fill=color, font=font)
|
||||||
return new_im
|
return (new_im, last_label_index)
|
||||||
|
|
||||||
# --- Processing Logic (Worker Thread) ---
|
# --- Processing Logic (Worker Thread) ---
|
||||||
|
|
||||||
|
|
@ -97,10 +96,15 @@ def worker_thread(base_dir, files_to_process, all_labels):
|
||||||
Iterates through files, prepares VISUALS only, and puts metadata in queue.
|
Iterates through files, prepares VISUALS only, and puts metadata in queue.
|
||||||
Does NOT write final JSON files anymore.
|
Does NOT write final JSON files anymore.
|
||||||
"""
|
"""
|
||||||
|
previous_copie = None
|
||||||
|
last_label_index = None
|
||||||
for img_path in files_to_process:
|
for img_path in files_to_process:
|
||||||
json_path = base_dir / f"{img_path.stem}.json"
|
json_path = base_dir / f"{img_path.stem}.json"
|
||||||
copie_part = int(img_path.stem[-2:])
|
copie_part = int(img_path.stem[-2:])
|
||||||
copie = img_path.stem[:-3]
|
copie = img_path.stem[:-3]
|
||||||
|
if copie != previous_copie:
|
||||||
|
last_label_index = -1
|
||||||
|
previous_copie = copie
|
||||||
json_schema_path = base_dir / 'Cutleft' / f"{copie}_schema.json"
|
json_schema_path = base_dir / 'Cutleft' / f"{copie}_schema.json"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -127,7 +131,8 @@ def worker_thread(base_dir, files_to_process, all_labels):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print(f"Buffering {img_path.name}...")
|
print(f"Buffering {img_path.name}...")
|
||||||
pil_image = prepare_image(str(img_path), bb_list, all_labels, nb_pages)
|
(pil_image, last_label_index) = \
|
||||||
|
prepare_image(str(img_path), bb_list, all_labels, nb_pages, last_label_index)
|
||||||
|
|
||||||
metadata = {
|
metadata = {
|
||||||
"copie": copie,
|
"copie": copie,
|
||||||
|
|
@ -169,6 +174,7 @@ class ImageViewer:
|
||||||
self.root.bind('<Return>', self.on_enter)
|
self.root.bind('<Return>', self.on_enter)
|
||||||
self.root.bind('e', self.on_edit)
|
self.root.bind('e', self.on_edit)
|
||||||
self.root.bind('o', self.on_open_pdf)
|
self.root.bind('o', self.on_open_pdf)
|
||||||
|
self.root.bind('i', self.on_open_interro)
|
||||||
self.root.bind('O', self.on_open_ori_pdf)
|
self.root.bind('O', self.on_open_ori_pdf)
|
||||||
self.root.bind('<Escape>', lambda e: self.root.quit())
|
self.root.bind('<Escape>', lambda e: self.root.quit())
|
||||||
self.label.bind('<Button-1>', self.on_click)
|
self.label.bind('<Button-1>', self.on_click)
|
||||||
|
|
@ -265,6 +271,12 @@ class ImageViewer:
|
||||||
subprocess.Popen(['xdg-open', str(pdf_path.absolute())])
|
subprocess.Popen(['xdg-open', str(pdf_path.absolute())])
|
||||||
|
|
||||||
def on_open_ori_pdf(self, event):
|
def on_open_ori_pdf(self, event):
|
||||||
|
if self.is_viewing and self.current_json_path:
|
||||||
|
pdf_path = "/home/sebastien/Staging/Interro/" + base_dir + "pdf"
|
||||||
|
print(f"Opening {pdf_path}")
|
||||||
|
subprocess.Popen(['xdg-open', pdf_path])
|
||||||
|
|
||||||
|
def on_open_interro(self, event):
|
||||||
if self.is_viewing and self.current_json_path:
|
if self.is_viewing and self.current_json_path:
|
||||||
new_filename = self.current_json_path.stem.split('_')[0] + ".pdf"
|
new_filename = self.current_json_path.stem.split('_')[0] + ".pdf"
|
||||||
pdf_path = self.current_json_path.parent / "Copies Originales" / new_filename
|
pdf_path = self.current_json_path.parent / "Copies Originales" / new_filename
|
||||||
|
|
|
||||||
|
|
@ -20,28 +20,28 @@ import ftfy
|
||||||
import re
|
import re
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
|
||||||
# url = "https://raw.githubusercontent.com/hbenbel/French-Dictionary/master/dictionary/dictionary.txt"
|
with open('liste_francais.txt', 'r') as f:
|
||||||
# french_words = urllib.request.urlopen(url).read().decode('utf-8').splitlines()
|
french_words = f.read().splitlines()
|
||||||
|
|
||||||
# 2. Pre-compute an O(1) lookup dictionary
|
# 2. Pre-compute an O(1) lookup dictionary
|
||||||
# We simulate the corruption by replacing accents with null bytes (\x00)
|
# We simulate the corruption by replacing accents with null bytes (\x00)
|
||||||
# lookup_map = {}
|
lookup_map = {}
|
||||||
# for word in french_words:
|
for word in french_words:
|
||||||
# # Replace all French accents with \x00 to create the "broken" key
|
# Replace all French accents with \x00 to create the "broken" key
|
||||||
# broken_key = re.sub(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]', '\x00', word)
|
broken_key = re.sub(r'[éèêëàâäîïôöùûüçœÉÈÊËÀÂÄÎÏÔÖÙÛÜÇŒ]', '\x00', word)
|
||||||
# if '\x00' in broken_key:
|
if '\x00' in broken_key:
|
||||||
# lookup_map[broken_key] = word # e.g., "\x00cole" -> "école"
|
lookup_map[broken_key] = word # e.g., "\x00cole" -> "école"
|
||||||
|
|
||||||
# 3. Fast replace function
|
# 3. Fast replace function
|
||||||
def fast_fix(text):
|
def fast_fix(text):
|
||||||
# Find words containing regular letters and null bytes
|
# Find words containing regular letters and null bytes
|
||||||
# def replacer(match):
|
def replacer(match):
|
||||||
# broken_word = match.group(0)
|
broken_word = match.group(0)
|
||||||
# # Return the fixed word from our map, or leave it if not found
|
# Return the fixed word from our map, or leave it if not found
|
||||||
# # (Handles case-insensitivity by falling back to lowercase map)
|
# (Handles case-insensitivity by falling back to lowercase map)
|
||||||
# return lookup_map.get(broken_word.lower(), broken_word)
|
return lookup_map.get(broken_word.lower(), broken_word)
|
||||||
|
|
||||||
# return re.sub(r'[a-zA-Z\x00]+', replacer, text)
|
return re.sub(r'[a-zA-Z\x00]+', replacer, text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue