Using folders "Copies" and "Par label", hopefully
parent
7e7045293a
commit
c6c1a052e1
11
Readme.org
11
Readme.org
|
|
@ -1,7 +1,7 @@
|
|||
#+title: Script
|
||||
#+author: Sébastien Miquel
|
||||
#+date: 14-03-2026
|
||||
# Time-stamp: <14-05-26 08:55>
|
||||
# Time-stamp: <17-05-26 10:51>
|
||||
#+OPTIONS:
|
||||
|
||||
* Méta
|
||||
|
|
@ -101,13 +101,13 @@ export GEMINI_API_KEY=…
|
|||
|
||||
Les key bindings ne sont pas adaptés à un clavier azerty… À changer…
|
||||
|
||||
Fix issues with =python page_splitter.py Interro14/Copie01.pdf=
|
||||
Fix issues with =python page_splitter.py Interro14/Copies/Copie01.pdf=
|
||||
4. =python cutleft.py Interro=
|
||||
|
||||
Découpe la partie gauche des copies, là où il devrait y avoir les
|
||||
labels des exercices/questions.
|
||||
|
||||
Rerun on a single file with =python cutleft.py Interro/Copie01.pdf=
|
||||
Rerun on a single file with =python cutleft.py Interro/Copies/Copie01.pdf=
|
||||
|
||||
** Génération d'information sur l'énoncé
|
||||
|
||||
|
|
@ -136,7 +136,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
|
|||
+ `|…` n'est pas arrêté verticalement par son type opposé.
|
||||
+ `…|` est stoppé horizontalement par le `|…` le plus proche.
|
||||
Pour modifier une seule copie :
|
||||
=python plotting.py Interro/Copie01.pdf=
|
||||
=python plotting.py Interro/Copies/Copie01.pdf=
|
||||
|
||||
Génère aussi les =Copie01.json=, à partir des =Copie01_01.json=
|
||||
1. En cas de soucis, (par exemple les pages ne sont pas dans le bon ordre)
|
||||
|
|
@ -147,6 +147,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
|
|||
3. =python splitting_int.py Interro=
|
||||
|
||||
Découpe les copies suivant les exercices
|
||||
Peut être appelé avec une seule copie.
|
||||
4. =python grouping.py Interro=
|
||||
|
||||
Regroupe les mêmes questions de différentes copies en groupes de
|
||||
|
|
@ -159,7 +160,7 @@ Set proxy with ~export HTTPS_PROXY="http://10.0.0.1:3128"~
|
|||
1. Il faut créer des persp, pour indication de comment corriger, et
|
||||
relancer =enonce_info.py=
|
||||
2. =python correction.py Interro --limit 240= OU
|
||||
=python correction.py Interro/Ex\ 2/Group_1.jpg= OU
|
||||
=python correction.py Interro/Par\ label/Ex\ 2/Group_1.jpg= OU
|
||||
=python correction.py Interro --overwrite=
|
||||
=python correction.py Interro --pro-by-label= (needs `labels_for_pro`)
|
||||
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]):
|
|||
# Find coordinates
|
||||
coordinates = None
|
||||
height,width= None, None
|
||||
label_dir = os.path.join(root_dir, label)
|
||||
label_dir = Path(root_dir) / "Par label" / label
|
||||
|
||||
# Search all json files in Dir/Par label/label
|
||||
json_files = glob.glob(os.path.join(label_dir, "*.json"))
|
||||
|
|
@ -59,7 +59,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]):
|
|||
break
|
||||
|
||||
# Construct PDF path: Dir/Copies/Copie{id}/{label}.pdf
|
||||
pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf")
|
||||
pdf_path = Path(root_dir) / "Copies" / f"Copie{student_id}" / f"{label}.pdf"
|
||||
|
||||
# Initialize dictionary structure for this ID if missing
|
||||
if student_id not in result_data:
|
||||
|
|
@ -89,8 +89,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]):
|
|||
# On ajoute des dummies
|
||||
if labels_to_redo: # Si la liste est non vide
|
||||
for lbl in labels_to_redo:
|
||||
pdf_path = os.path.join(root_dir,
|
||||
f"Copie{sid}", f"{lbl}.pdf")
|
||||
pdf_path = Path(root_dir) / "Copies" / f"Copie{sid}" / f"{lbl}.pdf"
|
||||
if not Path(pdf_path).exists():
|
||||
print("Debug : asked to refaire", sid, lbl, "but pdf absent")
|
||||
continue
|
||||
|
|
@ -107,8 +106,7 @@ def make_dictionary(root_dir, refaire=False, refaire_list=[]):
|
|||
else: # Ce student id n'a jamais été corrigé
|
||||
result_data[sid] = {}
|
||||
for lbl in labels_to_redo:
|
||||
pdf_path = os.path.join(root_dir,
|
||||
f"Copie{sid}", f"{lbl}.pdf")
|
||||
pdf_path = Path(root_dir) / "Copies" / f"Copie{sid}" / f"{lbl}.pdf"
|
||||
if not pdf_path.exists():
|
||||
print("Debug : asked to refaire", sid, lbl, "but pdf absent")
|
||||
continue
|
||||
|
|
@ -567,13 +565,13 @@ def process_student(student_id, labels_data, root_dir, all_labels, overwrite):
|
|||
d_notes = dict.fromkeys(all_labels, "")
|
||||
label_images = []
|
||||
|
||||
# !! Trier par l'ordre des labels plutôt
|
||||
sorted_labels = sorted(list(labels_data.items()), key=natural_key)
|
||||
|
||||
for label, content in sorted_labels:
|
||||
# 1. Find PDF path
|
||||
copie_folder = f"Copie{student_id}"
|
||||
pdf_rel_path = os.path.join(copie_folder, f"{label}.pdf")
|
||||
pdf_full_path = os.path.join(root_dir, pdf_rel_path)
|
||||
pdf_full_path = Path(root_dir) / "Copies" / copie_folder / f"{label}.pdf"
|
||||
|
||||
if not os.path.exists(pdf_full_path):
|
||||
print(f"File not found: {pdf_full_path}")
|
||||
|
|
@ -629,13 +627,14 @@ def process_correction(root_dir, data, all_labels, overwrite=False):
|
|||
# # Wait for all threads to complete
|
||||
# concurrent.futures.wait(futures)
|
||||
|
||||
# Ne pas thread cette applications
|
||||
# Ne pas thread cette application
|
||||
# 1. Il faut protéger les appels à matplotlib
|
||||
# 2. tu vas perdre les erreurs
|
||||
for student_id, labels in sorted(data.items()):
|
||||
process_student(student_id, labels, root_dir, all_labels, overwrite)
|
||||
|
||||
import argparse
|
||||
import utils
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Annotate copies")
|
||||
|
|
@ -644,7 +643,7 @@ if __name__ == "__main__":
|
|||
|
||||
args = parser.parse_args()
|
||||
root_dir = args.root_dir
|
||||
labels = list(filter(None, (Path(root_dir) / "labels").read_text().splitlines()))
|
||||
labels = utils.read_all_labels(root_dir)
|
||||
results = make_dictionary(root_dir)
|
||||
# Results is : Copie id -> label -> {pdf_path, gemini_result, coordinates}
|
||||
# Coordinates are the real coordinates (hmin, hmax) of the image in the Group
|
||||
|
|
|
|||
|
|
@ -38,14 +38,20 @@ for path_str in args.paths:
|
|||
# Handle individual file
|
||||
# Note: assumes structure InterroTest/Par label/Ex 2/Group_1.jpg, so the root is parents[2]
|
||||
label = arg_path.parent.name
|
||||
INPUT_DIR = arg_path.parent.parent.parent
|
||||
COPIES_DIR = INPUT_DIR / "Copies"
|
||||
GROUPS_DIR = INPUT_DIR / "Par label"
|
||||
tasks.append((str(arg_path), label))
|
||||
if label not in results:
|
||||
results[label] = []
|
||||
|
||||
elif arg_path.is_dir():
|
||||
INPUT_DIR = arg_path
|
||||
COPIES_DIR = INPUT_DIR / "Copies"
|
||||
GROUPS_DIR = INPUT_DIR / "Par label"
|
||||
# Handle directory (original behavior)
|
||||
for sub in arg_path.iterdir():
|
||||
if sub.is_dir() and sub.name.startswith("Ex"):
|
||||
for sub in GROUPS_DIR.iterdir():
|
||||
if sub.is_dir():
|
||||
label = sub.name
|
||||
if label not in results:
|
||||
results[label] = []
|
||||
|
|
@ -145,7 +151,7 @@ do not score or give feedback to any other question."""
|
|||
|
||||
def make_prompt(full_label):
|
||||
def read_longest_prefix_file(subdir):
|
||||
dir_path = Path(INPUT_DIR) / subdir
|
||||
dir_path = INPUT_DIR / subdir
|
||||
matches = [f for f in dir_path.iterdir()
|
||||
if f.is_file()
|
||||
and full_label.startswith(f.name)
|
||||
|
|
@ -167,7 +173,6 @@ from google.genai import types
|
|||
import base64
|
||||
import shlex
|
||||
import json
|
||||
from pathlib import Path
|
||||
import os
|
||||
import threading
|
||||
import concurrent.futures
|
||||
|
|
@ -210,7 +215,7 @@ def flush_thread_log(tid=None):
|
|||
tid = tid or threading.current_thread().name
|
||||
with log_lock:
|
||||
if thread_logs.get(tid):
|
||||
with open(Path(INPUT_DIR) / "correction_log", "a", encoding="utf-8") as f:
|
||||
with open(INPUT_DIR / "correction_log", "a", encoding="utf-8") as f:
|
||||
f.write(f"--- Task Log [{tid}] ---\n")
|
||||
f.write("\n".join(thread_logs[tid]) + "\n\n")
|
||||
thread_logs[tid].clear()
|
||||
|
|
@ -311,8 +316,8 @@ def generate_request(file, full_label):
|
|||
return (contents, generate_content_config)
|
||||
|
||||
client = genai.Client(api_key=api_key)
|
||||
output_path = Path(INPUT_DIR) / "correction.json"
|
||||
progress_path = Path(INPUT_DIR) / "correction_progress.json"
|
||||
output_path = INPUT_DIR / "correction.json"
|
||||
progress_path = INPUT_DIR / "correction_progress.json"
|
||||
start_time = time.time()
|
||||
overwrite = args.overwrite
|
||||
limit = args.limit
|
||||
|
|
@ -407,9 +412,9 @@ def get_single_image_bytes(pdf_path):
|
|||
return img_byte_arr.getvalue()
|
||||
|
||||
def correct_boxes_with_gemini(pid, label, original_feedbacks,
|
||||
root_dir, yming, ymaxg, width_r, total_height):
|
||||
yming, ymaxg, width_r, total_height):
|
||||
"""Requests corrected bounding boxes from Gemini Flash on the single image."""
|
||||
pdf_path = Path(root_dir) / f"Copie{pid}" / f"{label}.pdf"
|
||||
pdf_path = COPIES_DIR / f"Copie{pid}" / f"{label}.pdf"
|
||||
img_bytes = get_single_image_bytes(pdf_path)
|
||||
|
||||
localized_feedbacks = [f for f in original_feedbacks if f["box_2d"]]
|
||||
|
|
@ -473,9 +478,9 @@ it goes wrong, and the feedback is what went wrong.
|
|||
import shutil
|
||||
import grouping
|
||||
|
||||
def get_next_group_idx(root_dir, label):
|
||||
def get_next_group_idx(label):
|
||||
"""Finds the next available Group index for a given label."""
|
||||
target_folder = Path(root_dir) / label
|
||||
target_folder = GROUPS_DIR / label
|
||||
target_folder.mkdir(exist_ok=True)
|
||||
existing = list(target_folder.glob("Group_*.jpg"))
|
||||
if not existing: return 0
|
||||
|
|
@ -489,7 +494,7 @@ def handle_label_errors(pid, label, res, pdf_path):
|
|||
error_type = res.get("error")
|
||||
|
||||
all_labels = read_all_labels(INPUT_DIR)
|
||||
labels_txt = (Path(INPUT_DIR) / "labels").read_text(encoding="utf-8", errors="replace")
|
||||
labels_txt = (INPUT_DIR / "labels").read_text(encoding="utf-8", errors="replace")
|
||||
enonce = enonce_total(INPUT_DIR)
|
||||
|
||||
if error_type == "wrong-label":
|
||||
|
|
@ -523,7 +528,7 @@ Here is a list of all possible labels. You need to answer with one of these :
|
|||
if new_label == label:
|
||||
res["error"] = ""
|
||||
return []
|
||||
new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{new_label}.pdf"
|
||||
new_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{new_label}.pdf"
|
||||
if new_pdf_path.exists():
|
||||
tprint(f"\t\tCopie{pid} tried to move wrong {label} to {new_label}, but it already exists.")
|
||||
res["error"] = f"wrg-lbl:{new_label}?exists"
|
||||
|
|
@ -533,12 +538,12 @@ Here is a list of all possible labels. You need to answer with one of these :
|
|||
shutil.move(str(pdf_path), str(new_pdf_path))
|
||||
# Since we moved the file, this Copie/label should not be taken
|
||||
# into account in the future, I think
|
||||
idx = get_next_group_idx(INPUT_DIR, new_label)
|
||||
idx = get_next_group_idx(new_label)
|
||||
height = grouping.get_pdf_height(str(new_pdf_path))
|
||||
grouping.create_jpg(new_label, idx, [(pid, str(new_pdf_path), height)],
|
||||
INPUT_DIR)
|
||||
GROUPS_DIR)
|
||||
tprint(f"\t\tMaking {new_label} group {idx+1}")
|
||||
new_tasks.append((str(Path(INPUT_DIR) / new_label / f"Group_{idx+1}.jpg"),
|
||||
new_tasks.append((str(GROUPS_DIR / new_label / f"Group_{idx+1}.jpg"),
|
||||
new_label, False))
|
||||
|
||||
elif error_type == "additional-answer":
|
||||
|
|
@ -580,15 +585,15 @@ Here is a list of all possible labels. You need to answer with a list one of the
|
|||
error += f"{add_label}??"
|
||||
keep_error = True
|
||||
continue
|
||||
new_pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{add_label}.pdf"
|
||||
new_pdf_path = COPIES_DIR / f"Copie{pid}" / f"{add_label}.pdf"
|
||||
if not new_pdf_path.exists():
|
||||
shutil.copy(str(pdf_path), str(new_pdf_path))
|
||||
tprint(f"\t\tCopying Copie{pid} : {label} -> {add_label}")
|
||||
idx = get_next_group_idx(INPUT_DIR, add_label)
|
||||
idx = get_next_group_idx(add_label)
|
||||
tprint(f"\t\tMaking {add_label} group {idx+1}")
|
||||
height = grouping.get_pdf_height(str(new_pdf_path))
|
||||
grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], INPUT_DIR)
|
||||
new_tasks.append((str(Path(INPUT_DIR) / add_label / f"Group_{idx+1}.jpg"),
|
||||
grouping.create_jpg(add_label, idx, [(pid, str(new_pdf_path), height)], GROUPS_DIR)
|
||||
new_tasks.append((str(GROUPS_DIR / add_label / f"Group_{idx+1}.jpg"),
|
||||
add_label, False))
|
||||
error += f"(->){add_label}"
|
||||
keep_error = True
|
||||
|
|
@ -657,7 +662,7 @@ def process_single_task(task_tuple, precomputed_response=None):
|
|||
res = p["result"]
|
||||
yming, ymaxg, width_r = d_data[pid]
|
||||
|
||||
pdf_path = Path(INPUT_DIR) / f"Copie{pid}" / f"{label}.pdf"
|
||||
pdf_path = COPIES_DIR / f"Copie{pid}" / f"{label}.pdf"
|
||||
if (not can_spawn_tasks) and res["error"] == "additional-answer":
|
||||
tprint("\tSwallowing an additional-answer from a subsequent task.")
|
||||
res["error"]= ""
|
||||
|
|
@ -680,17 +685,22 @@ def process_single_task(task_tuple, precomputed_response=None):
|
|||
pid, label, group_name)
|
||||
continue
|
||||
|
||||
if (ymin < yming - 50 or
|
||||
ymax > ymaxg + 50 or
|
||||
xmax / 1000 > width_r):
|
||||
if (ymin < yming - 50 or ymax > ymaxg + 50 or xmax / 1000 > width_r):
|
||||
needs_correction.append(i)
|
||||
break
|
||||
if ymin < yming - 5:
|
||||
ymin = yming - 5
|
||||
b[0] = ymin * 1000 // total_height
|
||||
if ymax > ymaxg + 5:
|
||||
ymax = ymaxg + 5
|
||||
b[2] = ymax * 1000 // total_height
|
||||
|
||||
|
||||
if needs_correction:
|
||||
tprint(f"\tBox anomalies detected for Copie {pid} {group_name}. \n\tRequesting isolated correction from Gemini Flash...")
|
||||
try:
|
||||
res["feedback"] = correct_boxes_with_gemini(
|
||||
pid, label, res["feedback"], INPUT_DIR,
|
||||
pid, label, res["feedback"],
|
||||
yming, ymaxg, width_r, total_height)
|
||||
except Exception as e:
|
||||
tprint(f"\tCorrection failed for Copie {pid}, {group_name} : {e}\n\tRemoving the boxes")
|
||||
|
|
@ -726,8 +736,8 @@ def process_single_task(task_tuple, precomputed_response=None):
|
|||
|
||||
if __name__ == "__main__":
|
||||
if args.refaire:
|
||||
refaire_path = Path(INPUT_DIR) / "refaire.json"
|
||||
overwritten_path = Path(INPUT_DIR) / "overwritten_correction.json"
|
||||
refaire_path = INPUT_DIR / "refaire.json"
|
||||
overwritten_path = INPUT_DIR / "overwritten_correction.json"
|
||||
|
||||
if refaire_path.exists():
|
||||
with open(refaire_path, "r", encoding="utf-8") as f:
|
||||
|
|
@ -742,7 +752,7 @@ if __name__ == "__main__":
|
|||
|
||||
for copie_name, labels in refaire_list:
|
||||
pid = copie_name.replace("Copie", "")
|
||||
copie_dir = Path(INPUT_DIR) / copie_name
|
||||
copie_dir = COPIES_DIR / copie_name
|
||||
|
||||
# If list is empty, redo all labels available for this Copie
|
||||
if not labels:
|
||||
|
|
@ -772,10 +782,10 @@ if __name__ == "__main__":
|
|||
# 2. Make new group and add to tasks
|
||||
pdf_path = copie_dir / f"{label}.pdf"
|
||||
if pdf_path.exists():
|
||||
idx = get_next_group_idx(INPUT_DIR, label)
|
||||
idx = get_next_group_idx(label)
|
||||
height = grouping.get_pdf_height(str(pdf_path))
|
||||
grouping.create_jpg(label, idx, [(pid, str(pdf_path), height)], INPUT_DIR)
|
||||
new_group_path = str(Path(INPUT_DIR) / label / f"Group_{idx+1}.jpg")
|
||||
grouping.create_jpg(label, idx, [(pid, str(pdf_path), height)], GROUPS_DIR)
|
||||
new_group_path = str(GROUPS_DIR / label / f"Group_{idx+1}.jpg")
|
||||
tasks_to_process.append((new_group_path, label))
|
||||
|
||||
if dirty_results:
|
||||
|
|
@ -813,8 +823,8 @@ if __name__ == "__main__":
|
|||
tasks_to_process = [] # Run nothing live if just `--batch`
|
||||
|
||||
if batch_tasks:
|
||||
batch_flash_file = Path(INPUT_DIR) / "batch_requests_flash.jsonl"
|
||||
batch_pro_file = Path(INPUT_DIR) / "batch_requests_pro.jsonl"
|
||||
batch_flash_file = INPUT_DIR / "batch_requests_flash.jsonl"
|
||||
batch_pro_file = INPUT_DIR / "batch_requests_pro.jsonl"
|
||||
|
||||
count_flash = 0
|
||||
count_pro = 0
|
||||
|
|
@ -873,7 +883,7 @@ if __name__ == "__main__":
|
|||
|
||||
batched_responses = {}
|
||||
if args.deal_with_batched:
|
||||
batch_results_path = Path(INPUT_DIR) / "batched_correction_result.jsonl"
|
||||
batch_results_path = INPUT_DIR / "batched_correction_result.jsonl"
|
||||
if batch_results_path.exists():
|
||||
print(f"Loading batch results from {batch_results_path}...")
|
||||
with open(batch_results_path, "r", encoding="utf-8") as f:
|
||||
|
|
|
|||
28
cutleft.py
28
cutleft.py
|
|
@ -20,17 +20,35 @@ if len(sys.argv) < 2:
|
|||
path_arg = sys.argv[1]
|
||||
files = []
|
||||
INPUT_DIR = ""
|
||||
COPIES_DIR = ""
|
||||
|
||||
if os.path.isfile(path_arg) and path_arg.lower().endswith('.pdf'):
|
||||
INPUT_DIR = os.path.dirname(path_arg)
|
||||
COPIES_DIR = os.path.abspath(os.path.dirname(path_arg))
|
||||
# If the file is inside a "Copies" folder, set INPUT_DIR to the parent
|
||||
if os.path.basename(COPIES_DIR).lower() == 'copies':
|
||||
INPUT_DIR = os.path.dirname(COPIES_DIR)
|
||||
else:
|
||||
INPUT_DIR = COPIES_DIR
|
||||
files = [os.path.basename(path_arg)]
|
||||
elif os.path.isdir(path_arg):
|
||||
INPUT_DIR = path_arg
|
||||
files = sorted([f for f in os.listdir(INPUT_DIR) if f.lower().endswith('.pdf') and
|
||||
"nonc" not in f.lower()])
|
||||
# Support passing either the base dir or the Copies dir directly
|
||||
abs_path = os.path.abspath(path_arg)
|
||||
if os.path.basename(abs_path).lower() == 'copies':
|
||||
COPIES_DIR = abs_path
|
||||
INPUT_DIR = os.path.dirname(abs_path)
|
||||
else:
|
||||
INPUT_DIR = abs_path
|
||||
COPIES_DIR = os.path.join(INPUT_DIR, 'Copies')
|
||||
|
||||
if os.path.exists(COPIES_DIR):
|
||||
files = sorted([f for f in os.listdir(COPIES_DIR) if f.lower().endswith('.pdf') and
|
||||
"nonc" not in f.lower()])
|
||||
else:
|
||||
sys.exit(f"Error: Could not find 'Copies' directory inside {INPUT_DIR}")
|
||||
else:
|
||||
sys.exit("Error: Input must be a directory or a PDF file.")
|
||||
|
||||
|
||||
OUTPUT_DIR = os.path.join(INPUT_DIR, 'Cutleft')
|
||||
|
||||
if not os.path.exists(OUTPUT_DIR):
|
||||
|
|
@ -90,7 +108,7 @@ pdf_cache_lock = threading.Lock()
|
|||
|
||||
@lru_cache(maxsize=3)
|
||||
def _get_pdf_pages_cached(filename):
|
||||
pdf_path = os.path.join(INPUT_DIR, filename)
|
||||
pdf_path = os.path.join(COPIES_DIR, filename)
|
||||
return convert_from_path(pdf_path)
|
||||
|
||||
def get_pdf_pages(filename):
|
||||
|
|
|
|||
|
|
@ -250,7 +250,7 @@ def process_copy_group(group_key, files):
|
|||
for image_file in files:
|
||||
start_time = time.time()
|
||||
base_name = image_file.stem
|
||||
output_json = INPUT_DIR / f"{base_name}.json"
|
||||
output_json = INPUT_DIR / "Copies" / f"{base_name}.json"
|
||||
|
||||
# Check existing
|
||||
if output_json.exists() and not args.overwrite:
|
||||
|
|
|
|||
21
grouping.py
21
grouping.py
|
|
@ -3,6 +3,7 @@ import json
|
|||
import re
|
||||
import sys
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
|
@ -213,9 +214,9 @@ def create_jpg(identifier, group_index, group, root_dir):
|
|||
from utils import natural_key
|
||||
|
||||
|
||||
def process_identifier(identifier, files_info, root_dir):
|
||||
def process_identifier(identifier, files_info, output_dir):
|
||||
# Clear output directory if it exists
|
||||
target_folder = os.path.join(root_dir, identifier)
|
||||
target_folder = os.path.join(output_dir, identifier)
|
||||
if os.path.exists(target_folder):
|
||||
shutil.rmtree(target_folder)
|
||||
os.makedirs(target_folder, exist_ok=True)
|
||||
|
|
@ -224,27 +225,31 @@ def process_identifier(identifier, files_info, root_dir):
|
|||
file_groups = group_files(files_info)
|
||||
|
||||
for idx, group in enumerate(file_groups):
|
||||
create_jpg(identifier, idx, group, root_dir)
|
||||
create_jpg(identifier, idx, group, output_dir)
|
||||
|
||||
def main():
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python app.py <Path_to_Dir>")
|
||||
sys.exit(1)
|
||||
|
||||
root_dir = sys.argv[1]
|
||||
root_dir = Path(sys.argv[1])
|
||||
|
||||
copies_dir = root_dir / "Copies"
|
||||
par_label_dir = root_dir / "Par label"
|
||||
|
||||
print("Scanning files...")
|
||||
data = collect_files(root_dir)
|
||||
data = collect_files(copies_dir)
|
||||
|
||||
print(f"Found {len(data)} identifiers. Processing...")
|
||||
|
||||
# Sort identifiers naturally
|
||||
sorted_identifiers = sorted(data.keys(), key=natural_key)
|
||||
|
||||
# Process using 4 threads
|
||||
with ThreadPoolExecutor(max_workers=4) as executor:
|
||||
# Process using 8 threads
|
||||
with ThreadPoolExecutor(max_workers=8) as executor:
|
||||
for identifier in sorted_identifiers:
|
||||
executor.submit(process_identifier, identifier, data[identifier], root_dir)
|
||||
executor.submit(process_identifier, identifier, data[identifier],
|
||||
par_label_dir)
|
||||
|
||||
print("Done.")
|
||||
|
||||
|
|
|
|||
|
|
@ -63,6 +63,9 @@ class PDFPreviewer:
|
|||
# Check for existing original in backup and restore if found
|
||||
dir_name = os.path.dirname(os.path.abspath(path))
|
||||
file_name = os.path.basename(path)
|
||||
if os.path.basename(dir_name) == "Copies":
|
||||
dir_name = os.path.dirname(dir_name)
|
||||
path = os.path.join(dir_name, file_name)
|
||||
backup_path = os.path.join(dir_name, "Copies Originales", file_name)
|
||||
|
||||
if os.path.exists(backup_path):
|
||||
|
|
@ -313,9 +316,12 @@ class PDFPreviewer:
|
|||
file_name = os.path.basename(abs_path)
|
||||
|
||||
backup_dir = os.path.join(dir_name, "Copies Originales")
|
||||
copies_dir = os.path.join(dir_name, "Copies")
|
||||
os.makedirs(backup_dir, exist_ok=True)
|
||||
os.makedirs(copies_dir, exist_ok=True)
|
||||
|
||||
backup_path = os.path.join(backup_dir, file_name)
|
||||
copies_path = os.path.join(copies_dir, file_name)
|
||||
|
||||
# Remove backup if it already exists (overwrite)
|
||||
if os.path.exists(backup_path):
|
||||
|
|
@ -325,7 +331,7 @@ class PDFPreviewer:
|
|||
shutil.move(self.pdf_path, backup_path)
|
||||
|
||||
# Move the temp output file to replace the original
|
||||
shutil.move(self.final_file, self.pdf_path)
|
||||
shutil.move(self.final_file, copies_path)
|
||||
|
||||
# print(f"Original moved to {backup_path}, new file saved at {self.pdf_path}")
|
||||
|
||||
|
|
|
|||
31
plotting.py
31
plotting.py
|
|
@ -101,7 +101,7 @@ def worker_thread(base_dir, files_to_process, all_labels):
|
|||
previous_copie = None
|
||||
last_label_index = None
|
||||
for img_path in files_to_process:
|
||||
json_path = base_dir / f"{img_path.stem}.json"
|
||||
json_path = base_dir / "Copies" / f"{img_path.stem}.json"
|
||||
copie_part = int(img_path.stem[-2:])
|
||||
copie = img_path.stem[:-3]
|
||||
if copie != previous_copie:
|
||||
|
|
@ -222,7 +222,7 @@ class ImageViewer:
|
|||
def save_current_batch(self):
|
||||
"""Writes the accumulated data to the main JSON file."""
|
||||
if self.active_copie_name and self.accumulated_results:
|
||||
main_json_path = self.base_dir / f"{self.active_copie_name}.json"
|
||||
main_json_path = self.base_dir / "Copies" / f"{self.active_copie_name}.json"
|
||||
print(f"Writing aggregated result to {main_json_path}")
|
||||
with open(main_json_path, 'w') as f:
|
||||
json.dump(self.accumulated_results, f)
|
||||
|
|
@ -327,7 +327,7 @@ class ImageViewer:
|
|||
def on_open_ori_pdf(self, event):
|
||||
if self.is_viewing and self.current_json_path:
|
||||
new_filename = self.current_json_path.stem.split('_')[0] + ".pdf"
|
||||
pdf_path = self.current_json_path.parent / "Copies Originales" / new_filename
|
||||
pdf_path = self.base_dir / "Copies Originales" / new_filename
|
||||
print(f"Opening {pdf_path}")
|
||||
subprocess.Popen(['xdg-open', str(pdf_path.absolute())])
|
||||
|
||||
|
|
@ -363,20 +363,21 @@ if __name__ == "__main__":
|
|||
files_to_process = []
|
||||
|
||||
if input_path.is_file():
|
||||
# Correctly identify base_dir if we are in 'Copies' or 'Cutleft'
|
||||
if input_path.parent.name in ["Copies", "Cutleft"]:
|
||||
base_dir = input_path.parent.parent
|
||||
else:
|
||||
base_dir = input_path.parent
|
||||
|
||||
base_dir = input_path.parent
|
||||
stem = input_path.stem
|
||||
img_path = base_dir / "Cutleft" / f"{stem}.jpg"
|
||||
files_to_process = [img_path]
|
||||
if not img_path.exists() and input_path.parent.name == "Cutleft":
|
||||
base_dir = input_path.parent.parent
|
||||
img_path = input_path
|
||||
files_to_process = [img_path]
|
||||
if not img_path.exists():
|
||||
# We're given Copie01.pdf, look for parts
|
||||
cutleft_dir = base_dir / "Cutleft"
|
||||
files_to_process = sorted(list(cutleft_dir.glob(f"{img_path.stem}_*.jpg")),
|
||||
key=natural_key)
|
||||
cutleft_dir = base_dir / "Cutleft"
|
||||
img_path = cutleft_dir / f"{stem}.jpg"
|
||||
|
||||
if img_path.exists():
|
||||
files_to_process = [img_path]
|
||||
else:
|
||||
# We're given something like Copie01.pdf, look for its split image parts
|
||||
files_to_process = sorted(list(cutleft_dir.glob(f"{stem}_*.jpg")), key=natural_key)
|
||||
else:
|
||||
base_dir = input_path
|
||||
cutleft_dir = base_dir / "Cutleft"
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import os
|
|||
import json
|
||||
import numpy as np
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from PIL import Image, ImageChops, ImageFilter
|
||||
Image.MAX_IMAGE_PIXELS = None
|
||||
from pdf2image import convert_from_path
|
||||
|
|
@ -99,7 +100,7 @@ def detect_checks_and_notes(output_dir):
|
|||
density = changed_pixels / roi.size
|
||||
|
||||
if density > DENSITY_THRESHOLD:
|
||||
print("A checked box !", density, b)
|
||||
# print("A checked box !", density, b)
|
||||
actions.append(box)
|
||||
# It's checked, so we mask this area out for manual notes
|
||||
# Expand mask slightly to catch sloppy ticks
|
||||
|
|
@ -254,7 +255,7 @@ def apply_actions_and_regenerate(root_dir, data, student_id, actions, notes_laye
|
|||
|
||||
# B. Regenerate Label Image
|
||||
# We always regenerate to ensure Concat.jpg is consistent with any modifications
|
||||
pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf")
|
||||
pdf_path = Path(root_dir) / "Copies" / f"Copie{student_id}" / f"{label}.pdf"
|
||||
if not os.path.exists(pdf_path): continue
|
||||
|
||||
(base_img, _, _) = annotating.make_base_image(pdf_path)
|
||||
|
|
@ -328,7 +329,6 @@ def apply_actions_and_regenerate(root_dir, data, student_id, actions, notes_laye
|
|||
full_img.save(os.path.join(output_dir, "Concat_F.jpg"))
|
||||
print(f" Saved regenerated Concat_F.jpg")
|
||||
|
||||
from pathlib import Path
|
||||
from utils import read_all_labels
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
|
|
|
|||
|
|
@ -85,7 +85,8 @@ def save_paginated_pdf(image_groups, output_path):
|
|||
if pages:
|
||||
pages[0].save(output_path, "PDF", resolution=100.0, save_all=True, append_images=pages[1:])
|
||||
|
||||
def apply_actions_and_regenerate_grouped(root_dir, data, student_id, actions, label_notes, all_labels):
|
||||
def apply_actions_and_regenerate_grouped(root_dir, data, student_id,
|
||||
actions, label_notes, all_labels):
|
||||
"""
|
||||
Modifies data based on actions, pastes label-specific note crops,
|
||||
regenerates label images for consistency, saves dirty ones,
|
||||
|
|
@ -161,7 +162,7 @@ def apply_actions_and_regenerate_grouped(root_dir, data, student_id, actions, la
|
|||
result = content['result']
|
||||
d_notes[label] = str(result.get('score', 0))
|
||||
|
||||
pdf_path = os.path.join(root_dir, f"Copie{student_id}", f"{label}.pdf")
|
||||
pdf_path = Path(root_dir) / "Copies" / f"Copie{student_id}" / f"{label}.pdf"
|
||||
if not os.path.exists(pdf_path): continue
|
||||
|
||||
(base_img, _, _) = annotating.make_base_image(pdf_path)
|
||||
|
|
@ -204,13 +205,15 @@ def apply_actions_and_regenerate_grouped(root_dir, data, student_id, actions, la
|
|||
concat_list.append(final_img)
|
||||
|
||||
perfect_no_comment = True
|
||||
if float(d_notes[label]) != 4.0:
|
||||
if float(d_notes[label]) < 4.0:
|
||||
perfect_no_comment = False
|
||||
else:
|
||||
if len(result.get('feedback', [])) != 0:
|
||||
perfect_no_comment = False
|
||||
lfb = result.get('feedback', [])
|
||||
for e in lfb:
|
||||
if "to_delete" not in e or not e["to_delete"]:
|
||||
perfect_no_comment = False
|
||||
|
||||
if not perfect_no_comment:
|
||||
if not perfect_no_comment or has_notes:
|
||||
extras = get_extra_pdfs_as_images(root_dir, label, annotating)
|
||||
extras.append(final_img)
|
||||
concat_list_F.append(extras)
|
||||
|
|
@ -333,7 +336,8 @@ if __name__ == "__main__":
|
|||
if hmax > hmin:
|
||||
crop = notes_img.crop((0, hmin, notes_img.width, hmax))
|
||||
if has_significant_notes(crop):
|
||||
notes_by_student[sid][lbl] = {'img': crop, 'old_header_h': img_info.get("header_height", 0)}
|
||||
notes_by_student[sid][lbl] = {'img': crop,
|
||||
'old_header_h': img_info.get("header_height", 0)}
|
||||
|
||||
|
||||
def process_refaire_entry(sid, r_labels):
|
||||
|
|
@ -364,7 +368,9 @@ if __name__ == "__main__":
|
|||
if hmax > hmin:
|
||||
crop = b_notes_img.crop((0, hmin, b_notes_img.width, hmax))
|
||||
if has_significant_notes(crop):
|
||||
notes_by_student[sid][lbl] = {'img': crop, 'old_header_h': img_info.get("header_height", 0)}
|
||||
notes_by_student[sid][lbl] = \
|
||||
{'img': crop,
|
||||
'old_header_h': img_info.get("header_height", 0)}
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ def decode_json(pdf_file):
|
|||
def split_an_interro(base_dir, input_pdf, coords_list):
|
||||
doc = fitz.open(input_pdf)
|
||||
|
||||
output_dir = base_dir / input_pdf.stem
|
||||
output_dir = base_dir / "Copies" / input_pdf.stem
|
||||
generated_files = set()
|
||||
parts_by_label = defaultdict(list)
|
||||
|
||||
|
|
@ -197,10 +197,13 @@ if __name__ == "__main__":
|
|||
|
||||
if input_arg.is_file():
|
||||
base_dir = input_arg.parent
|
||||
if base_dir.name == "Copies":
|
||||
base_dir = base_dir.parent
|
||||
pdf_files = [input_arg]
|
||||
elif input_arg.is_dir():
|
||||
base_dir = input_arg
|
||||
pdf_files = sorted(base_dir.glob("*.pdf"))
|
||||
copies_dir = base_dir / "Copies"
|
||||
pdf_files = sorted(copies_dir.glob("*.pdf"))
|
||||
else:
|
||||
print(f"Error: {input_arg} is not a valid file or directory.")
|
||||
sys.exit(1)
|
||||
|
|
|
|||
4
utils.py
4
utils.py
|
|
@ -5,9 +5,7 @@ def natural_key(text):
|
|||
return [int(c) if c.isdigit() else c.lower() for c in re.split(r'(\d+)', str(text))]
|
||||
|
||||
def read_all_labels(base_dir):
|
||||
return sorted(list(filter(None,
|
||||
(Path(base_dir) / "labels").read_text().splitlines())),
|
||||
key = natural_key)
|
||||
return list(filter(None, (Path(base_dir) / "labels").read_text().splitlines()))
|
||||
|
||||
def enonce_total(base_dir):
|
||||
text_dir = Path(base_dir) / 'Text'
|
||||
|
|
|
|||
Loading…
Reference in New Issue