Logging suport : in correction_log

master
Sébastien Miquel 2026-03-28 10:28:03 +01:00
parent 173e77a64a
commit bd35e69534
1 changed files with 167 additions and 124 deletions

View File

@ -165,6 +165,47 @@ MODEL_ID_pro = "gemini-3.1-pro-preview"
MODEL_ID_flash = "gemini-3-flash-preview" MODEL_ID_flash = "gemini-3-flash-preview"
api_key = os.environ["GEMINI_API_KEY"] api_key = os.environ["GEMINI_API_KEY"]
import signal
import sys
# --- Thread-safe Logging ---
log_lock = threading.Lock()
thread_logs = {}
def tprint(*args, **kwargs):
"""Buffer messages per thread to group them."""
tid = threading.current_thread().name
msg = " ".join(map(str, args))
with log_lock:
if tid not in thread_logs:
thread_logs[tid] = []
thread_logs[tid].append(msg)
Optional: Keep printing to console but prefix with thread name
print(f"[{tid}] {msg}", **kwargs)
def flush_thread_log(tid=None):
"""Append a thread's buffered messages to the log file contiguously."""
tid = tid or threading.current_thread().name
with log_lock:
if thread_logs.get(tid):
with open("correction_log", "a", encoding="utf-8") as f:
f.write(f"--- Task Log [{tid}] ---\n")
f.write("\n".join(thread_logs[tid]) + "\n\n")
thread_logs[tid].clear()
def handle_interrupt(sig, frame):
"""Flush all partial/unfinished logs if program is interrupted."""
print("\nInterrupt received. Flushing partial logs...", file=sys.stderr)
for tid in list(thread_logs.keys()):
flush_thread_log(tid)
sys.exit(1)
signal.signal(signal.SIGINT, handle_interrupt)
signal.signal(signal.SIGTERM, handle_interrupt)
# ---------------------------
from pydantic import BaseModel, Field, TypeAdapter from pydantic import BaseModel, Field, TypeAdapter
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
@ -183,7 +224,6 @@ class EvaluationEntry(BaseModel):
result: ResultData = Field(description="Result details") result: ResultData = Field(description="Result details")
# The root model for parsing is be: List[EvaluationEntry] # The root model for parsing is be: List[EvaluationEntry]
def generate_request(file, full_label): def generate_request(file, full_label):
"""Generates request for Gemini.""" """Generates request for Gemini."""
prompt = make_prompt(full_label) prompt = make_prompt(full_label)
@ -271,16 +311,16 @@ def call_gemini_with_retries(model_id, contents, config,
# Immediately fallback to Flash without waiting if it's a Pro quota error # Immediately fallback to Flash without waiting if it's a Pro quota error
if is_quota_error and model_id == MODEL_ID_pro and fallback_model_id: if is_quota_error and model_id == MODEL_ID_pro and fallback_model_id:
print(f"\tGemini Pro quota hit ({e}). Falling back to Flash permanently...") tprint(f"\tGemini Pro quota hit ({e}). Falling back to Flash permanently...")
model_id = fallback_model_id model_id = fallback_model_id
pro_quota_exhausted = True pro_quota_exhausted = True
continue # Retry immediately with Flash continue # Retry immediately with Flash
if attempt < 2: if attempt < 2:
print(f"\tGemini API failure: {e}. Retrying in {delays[attempt]} seconds...") tprint(f"\tGemini API failure: {e}. Retrying in {delays[attempt]} seconds...")
time.sleep(delays[attempt]) time.sleep(delays[attempt])
else: else:
print(f"\tGemini API failure: {e}. Maximum retries reached.") tprint(f"\tGemini API failure: {e}. Maximum retries reached.")
raise raise
import io import io
@ -395,7 +435,7 @@ def handle_label_errors(pid, label, res, pdf_path):
enonce = enonce_total(INPUT_DIR) enonce = enonce_total(INPUT_DIR)
if error_type == "wrong-label": if error_type == "wrong-label":
print(f"\tHandling wrong-label for {pid} {label}") tprint(f"\tHandling wrong-label for {pid} {label}")
prompt = f"""This image is a part of the answer of a student to a written exam. prompt = f"""This image is a part of the answer of a student to a written exam.
It was initially labeled '{label}' but I suspect this label is wrong. Perhaps the student himself wrote the wrong label. It was initially labeled '{label}' but I suspect this label is wrong. Perhaps the student himself wrote the wrong label.
@ -419,7 +459,7 @@ Here is a list of all possible lables. You need to answer with one of these :
config = types.GenerateContentConfig(temperature=0.0) config = types.GenerateContentConfig(temperature=0.0)
new_label = call_gemini_with_retries(MODEL_ID_flash, contents, config).strip().strip('"\'') new_label = call_gemini_with_retries(MODEL_ID_flash, contents, config).strip().strip('"\'')
if new_label not in all_labels: if new_label not in all_labels:
print(f"\t\tCopie{pid} returned an incorrect label {new_label} from an initial wrong label {label}. Ignoring") tprint(f"\t\tCopie{pid} returned an incorrect label {new_label} from an initial wrong label {label}. Ignoring")
res["error"] = "wrg-lbl:cldtfix" res["error"] = "wrg-lbl:cldtfix"
return [] return []
if new_label == label: if new_label == label:
@ -427,17 +467,17 @@ Here is a list of all possible lables. You need to answer with one of these :
return [] return []
new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{new_label}.pdf" new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{new_label}.pdf"
if new_pdf_path.exists(): if new_pdf_path.exists():
print(f"\t\tCopie{pid} tried to move wrong {label} to {new_label}, but it already exists.") tprint(f"\t\tCopie{pid} tried to move wrong {label} to {new_label}, but it already exists.")
res["error"] = f"wrg-lbl:{new_label}?exists" res["error"] = f"wrg-lbl:{new_label}?exists"
else: else:
print(f"\t\tCopie{pid} : moving wrong {label} to {new_label}.") tprint(f"\t\tCopie{pid} : moving wrong {label} to {new_label}.")
shutil.move(str(pdf_path), str(new_pdf_path)) shutil.move(str(pdf_path), str(new_pdf_path))
# Since we moved the file, this Copie/label should not be taken # Since we moved the file, this Copie/label should not be taken
# into account in the future, I think # into account in the future, I think
idx = get_next_group_idx(INPUT_DIR, new_label) idx = get_next_group_idx(INPUT_DIR, new_label)
height = grouping.get_pdf_height(str(new_pdf_path)) height = grouping.get_pdf_height(str(new_pdf_path))
grouping.create_jpg(new_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR) grouping.create_jpg(new_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR)
print(f"\t\tMaking {new_label} group {idx+1}") tprint(f"\t\tMaking {new_label} group {idx+1}")
new_tasks.append((str(Path(INPUT_DIR) / new_label / f"Group_{idx+1}.jpg"), new_tasks.append((str(Path(INPUT_DIR) / new_label / f"Group_{idx+1}.jpg"),
new_label, False)) new_label, False))
@ -458,7 +498,7 @@ Here is a list of all possible labels. You need to answer with a list one of the
{labels_txt} {labels_txt}
""" """
print(f"\tHandling additional-answer for {pid} {label}") tprint(f"\tHandling additional-answer for {pid} {label}")
contents = [types.Content(role="user", parts=[ contents = [types.Content(role="user", parts=[
types.Part.from_bytes(data=get_single_image_bytes(pdf_path), mime_type="image/jpeg"), types.Part.from_bytes(data=get_single_image_bytes(pdf_path), mime_type="image/jpeg"),
types.Part.from_text(text=prompt) types.Part.from_text(text=prompt)
@ -469,28 +509,28 @@ Here is a list of all possible labels. You need to answer with a list one of the
except Exception: except Exception:
add_labels = [] add_labels = []
print(f"\tHandling additional-answer for {pid} {label}") tprint(f"\tHandling additional-answer for {pid} {label}")
keep_error = False keep_error = False
for add_label in add_labels: for add_label in add_labels:
if add_label == label: if add_label == label:
continue continue
if add_label not in all_labels: if add_label not in all_labels:
print(f"\t\t Inexistent label from additional-answer processing {pid} {label}. Ignoring") tprint(f"\t\t Inexistent label from additional-answer processing {pid} {label}. Ignoring")
keep_error = True keep_error = True
continue continue
new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{add_label}.pdf" new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{add_label}.pdf"
if not new_pdf_path.exists(): if not new_pdf_path.exists():
shutil.copy(str(pdf_path), str(new_pdf_path)) shutil.copy(str(pdf_path), str(new_pdf_path))
print(f"\t\tCopying Copie{pid} : {label} -> {add_label}") tprint(f"\t\tCopying Copie{pid} : {label} -> {add_label}")
idx = get_next_group_idx(INPUT_DIR, add_label) idx = get_next_group_idx(INPUT_DIR, add_label)
print(f"\t\tMaking {add_label} group {idx+1}") tprint(f"\t\tMaking {add_label} group {idx+1}")
height = grouping.get_pdf_height(str(new_pdf_path)) height = grouping.get_pdf_height(str(new_pdf_path))
grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR) grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR)
new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"), new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"),
add_label, False)) add_label, False))
else: else:
keep_error = True keep_error = True
print(f"\t\tAlready present (not copied) Copie{pid} : {label} -> {add_label}") tprint(f"\t\tAlready present (not copied) Copie{pid} : {label} -> {add_label}")
if not keep_error: if not keep_error:
@ -499,6 +539,7 @@ Here is a list of all possible labels. You need to answer with a list one of the
return new_tasks return new_tasks
def process_single_task(task_tuple): def process_single_task(task_tuple):
try:
global pro_count, flash_count, pro_quota_exhausted global pro_count, flash_count, pro_quota_exhausted
file_path = task_tuple[0] file_path = task_tuple[0]
label = task_tuple[1] label = task_tuple[1]
@ -532,7 +573,7 @@ def process_single_task(task_tuple):
try: try:
contents, config = generate_request(file_path, label) contents, config = generate_request(file_path, label)
model_to_use = MODEL_ID_flash if use_flash else MODEL_ID_pro model_to_use = MODEL_ID_flash if use_flash else MODEL_ID_pro
print(f"Asking Gemini {'Flash' if use_flash else 'Pro '}: {label} {group_name}") tprint(f"Asking Gemini {'Flash' if use_flash else 'Pro '}: {label} {group_name}")
full_response_text = call_gemini_with_retries(model_to_use, contents, config) full_response_text = call_gemini_with_retries(model_to_use, contents, config)
json_data = json.loads(full_response_text) json_data = json.loads(full_response_text)
@ -545,7 +586,7 @@ def process_single_task(task_tuple):
pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf" pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf"
if res["error"] != "": if res["error"] != "":
print("\tError :", res["error"], "for Copie", pid, group_name) tprint("\tError :", res["error"], "for Copie", pid, group_name)
if can_spawn_tasks and res.get("error") in ["wrong-label", "additional-answer"]: if can_spawn_tasks and res.get("error") in ["wrong-label", "additional-answer"]:
new_tasks.extend(handle_label_errors(pid, label, res, pdf_path)) new_tasks.extend(handle_label_errors(pid, label, res, pdf_path))
@ -559,7 +600,7 @@ def process_single_task(task_tuple):
ymax = ymax * total_height // 1000 ymax = ymax * total_height // 1000
if pid not in d_data: if pid not in d_data:
print("Error : Gemini answered a copie id not present", tprint("Error : Gemini answered a copie id not present",
pid, label, group_name) pid, label, group_name)
continue continue
@ -589,13 +630,13 @@ def process_single_task(task_tuple):
# f["box_2d"][3] = int(width_r * 1000) # f["box_2d"][3] = int(width_r * 1000)
if needs_correction: if needs_correction:
print(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...") tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...")
try: try:
res["feedback"] = correct_boxes_with_gemini( res["feedback"] = correct_boxes_with_gemini(
pid, label, res["feedback"], INPUT_DIR, pid, label, res["feedback"], INPUT_DIR,
yming, ymaxg, width_r, total_height) yming, ymaxg, width_r, total_height)
except Exception as e: except Exception as e:
print(f"\tCorrection failed for Copie {pid}, {group_name} : {e}\n\tRemoving the boxes") tprint(f"\tCorrection failed for Copie {pid}, {group_name} : {e}\n\tRemoving the boxes")
# Fallback if the second request fails entirely # Fallback if the second request fails entirely
for (i, f) in enumerate(res["feedback"]): for (i, f) in enumerate(res["feedback"]):
if i in needs_correction: if i in needs_correction:
@ -616,13 +657,15 @@ def process_single_task(task_tuple):
json.dump(completed_tasks, f, indent=2) json.dump(completed_tasks, f, indent=2)
except json.JSONDecodeError: except json.JSONDecodeError:
print(f"Error decoding JSON for {file_path}", file=sys.stderr) tprint(f"Error decoding JSON for {file_path}", file=sys.stderr)
except Exception as e: except Exception as e:
error_msg = f"Exception processing {file_path}: {e}" error_msg = f"Exception processing {file_path}: {e}"
print(error_msg, file=sys.stderr) print(error_msg, file=sys.stderr)
with io_lock: with io_lock:
errors_summary.append((error_msg, file_path)) errors_summary.append((error_msg, file_path))
return new_tasks return new_tasks
finally:
flush_thread_log()
if __name__ == "__main__": if __name__ == "__main__":
print(f"Starting processing on {len(tasks_to_process)} tasks with {NB_THREADS} threads...") print(f"Starting processing on {len(tasks_to_process)} tasks with {NB_THREADS} threads...")