From 4a77f0905b4538e88c6aadf5b6179d73c9328ace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Miquel?= Date: Sat, 25 Apr 2026 09:18:32 +0200 Subject: [PATCH] Script.org file and batching utilities --- Script.org | 136 +++++++++++++++++++++++++++++++++++++++ batch_status.py | 88 +++++++++++++++++++++++++ fetch_batched_results.py | 63 ++++++++++++++++++ submit_batches.py | 79 +++++++++++++++++++++++ 4 files changed, 366 insertions(+) create mode 100644 Script.org create mode 100644 batch_status.py create mode 100644 fetch_batched_results.py create mode 100644 submit_batches.py diff --git a/Script.org b/Script.org new file mode 100644 index 0000000..1d37dc5 --- /dev/null +++ b/Script.org @@ -0,0 +1,136 @@ +#+title: Script +#+author: Sébastien Miquel +#+date: 14-03-2026 +# Time-stamp: <22-04-26 10:54> +#+OPTIONS: + +NEEDS : + - `export GEMINI_API_KEY=…` + - fichier `names` in the directory. + +* Prétraitement + + 1. Rotate every single page 180 + `./rotate_all.sh Interro14` + 2. `./rename_to_copie.sh Interro14` + 3. `python page_splitter.py Interro`. + Fix issues with `python page_splitter.py Interro14/Copie01.pdf` + 4. `python cutleft.py Interro` + Rerun on a single file with `python cutleft.py Interro/Copie01.pdf` + 5. `python enonce_info.py Interro` + +* Labelisation et regroupement + +Set proxy with `export HTTPS_PROXY="http://192.168.241.1:3128"` + + 7. `python gemini_for_labels.py Interro`, avec éventuellement `--overwrite` + 8. Vérification visuelle : `python plotting.py Interro` + `python plotting.py InterroTest/Copie01.pdf` + + It also generates les `Copie01.json`, à partir des `Copie01_01.json` + + In case of issue, you may need to + - Reorder the pdf + - Run `python cutleft.py Interro/Copie` + - Run `python gemini_dir_batching.py Interro/Copie` + 9. `python splitting_int.py Interro` + 10. `python grouping.py Interro` + +* Correction et annotation + + Success! 
Batch Job Name: batches/0hc83m3anayrs5iljygg2v6ozsvxz58k6a5r + +Processing batch_requests_pro.jsonl for model gemini-3.1-pro-preview... + Uploading file... + Uploaded successfully! File ID: files/oasj4aty5kco + Starting batch job... + Success! Batch Job Name: batches/8pk4m2snr17n31pun3vwzn646qvtkj8ao192 + + +Set proxy with `export HTTPS_PROXY="http://10.0.0.1:3128"` + + 1. Il faut créer des persp, pour indication de comment corriger, et + relancer `enonce_info.py` + 2. `python correction.py Interro --limit 240` OU + `python correction.py Interro/Ex\ 2/Group_1.jpg` OU + `python correction.py Interro --overwrite` + + Will it resume ? It seems so. Best to wait a bit. + To batch it : + + `python correction.py Interro --batch` + + `python submit_batches.py Interro` + + `python batch_status.py` + + `python fetch_batched_results.py DS08VB` + + `python correction.py DS08VB --deal-with-batched` + 3. Try `python post-correction.py Interro` ; It makes a + `fixed_correction.json`, to check. + 4. Facultatif : `python annotating.py Interro` dans `Anot`, pass `--overwrite` + 5. + + `python annotating_with_checks.py Interro` dans `Bnot`, pass `--overwrite` + OU + + `python annotating_by_label.py Interro` dans `BGnot` + _Needs_ : label_groups file. (made automatically by this function) + + 6. `python to_tablette.py Interro` + Cela déplace les groupes dans `SyncCopies/À Annoter`. + - Les mettre dans le dossier racine de la tablette, et renommer en `aaa`. + - Vider `Syncthing/Annotées` sur la tablette et localement. + À automatiser, aussi c'est lent… + +* Lecture de la correction manuelle + + 16. Manually : delete `~/SyncCopies/Annotées`, copy from the tablette to here. + + Then `python from_tablette.py Interro` + + 17. + + `python reading_annotations.py Interro` + OU + + `python reading_grouped_annotations.py Interro` + 18. `python giving_names.py InterroTest BGnot` + + It will make `A Rendre` with symlink to the Concat.jpg file + either in Anot or Bnot, and score.json. 
+ + + In case of Unknown : rename both directory and file inside. + + Here, you can change `score.json` manually. + 19. + + `gestion_classe ne` pour créer l'interro puis + + `gestion_classe we` (set barème here) + + `python update_ods.py Interro` + + `gestion_classe re` + + `gestion_classe wsent` + + `python add_final_score.py Interro21` + (this makes files in `Server/copies`) + 20. + + Deploy `miqmacs-copies-assets`, and + + update the copies from `miqmacs.fr/admin`. + +* Recorrection d'une seule copie + +!! Attention, refaire ne marchera pas si tu fais une annotation non +groupée into refaire !! + + 1. Redécoupage + + `python plotting.py InterroTest/Copie01.pdf` + + `python splitting_int.py InterroTest/Copie20.pdf` + 2. Créer `refaire.json`, avec un contenu comme + [["Copie01", []], + ["Copie01", ["Ex 1 : 1)"]]] + 3. Appeler `correction` avec --refaire. Il doit créer des groupes + individuels, faire des requêtes, et remplacer les corrections + précédentes (à sauver ailleurs). + + Ou non, si tu veux le faire à la main. + 4. ?? Si je fais refaire, avant d'avoir créé les annotating with + checks, que se passe-t-il ??? + 5. Appeler `annotating_with_checks.py --refaire --overwrite`. + 6. `python to_tablette.py --refaire Interro24` + 7. `python from_tablette.py --refaire Interro24` + 8.
def list_jobs():
    """Print a summary of every batch job returned by the module-level client.

    For each job the name, the display name (when set) and the state are
    printed. A failed job additionally shows its ``error`` attribute and a
    succeeded job the name of its output file, when one is present. After
    the listing, a hint on how to download a job is printed, or a notice
    that no job exists.

    Any exception raised while listing ends the process through
    ``sys.exit`` with an explanatory message.
    """
    print("Fetching recent batch jobs...\n")
    rule = "-" * 60
    try:
        seen = 0
        for seen, job in enumerate(client.batches.list(), start=1):
            # The state may be an enum-like object (use its name) or a plain value.
            state = getattr(job.state, 'name', job.state)

            print(rule)
            print(f"Job Name: {job.name}")

            label = getattr(job, 'display_name', None)
            if label:
                print(f"Display Name: {label}")

            print(f"State: {state}")

            if state == 'JOB_STATE_FAILED':
                if hasattr(job, 'error'):
                    print(f"Error: {job.error}")
            elif state == 'JOB_STATE_SUCCEEDED':
                dest = getattr(job, 'dest', None)
                output_file = getattr(dest, 'file_name', None) if dest else None
                if output_file:
                    print(f"Output File: {output_file}")

        if seen:
            print(rule)
            print("\nTo download a completed job, run:")
            print("python batch_status.py --download batches/")
        else:
            print("No batch jobs found.")

    except Exception as e:
        sys.exit(f"An error occurred while listing jobs: {e}")
def download_job(job_name):
    """Download the output file of one batch job into the current directory.

    The job is fetched from the module-level client by ``job_name``. When
    its state is not ``JOB_STATE_SUCCEEDED`` a notice is printed (plus the
    job's ``error`` attribute for a failed job) and nothing is downloaded.
    Otherwise the job's output file is saved as ``results_<job_name>.jsonl``,
    with every ``/`` of the job name replaced by ``_``.

    Any exception raised along the way ends the process through
    ``sys.exit`` with an explanatory message.
    """
    print(f"Checking status for {job_name}...\n")
    try:
        job = client.batches.get(name=job_name)
        # The state may be an enum-like object (use its name) or a plain value.
        state = getattr(job.state, 'name', job.state)

        print(f"State: {state}")

        if state != 'JOB_STATE_SUCCEEDED':
            print("Job is not ready yet or has failed.")
            if state == 'JOB_STATE_FAILED' and hasattr(job, 'error'):
                print(f"Error: {job.error}")
            return

        dest = getattr(job, 'dest', None)
        result_file_name = getattr(dest, 'file_name', None) if dest else None
        if not result_file_name:
            print("Job succeeded but no output file was found.")
            return

        print(f"Downloading results from {result_file_name}...")
        payload = client.files.download(file=result_file_name)

        # Job names contain a slash ("batches/..."), which cannot appear in a file name.
        output_path = "results_" + job_name.replace('/', '_') + ".jsonl"
        with open(output_path, "wb") as out_file:
            out_file.write(payload)

        print(f"Success! Saved to {output_path}")
        print(f"You can now feed this to your correction script using: --deal-with-batched {output_path}")

    except Exception as e:
        sys.exit(f"An error occurred while fetching the job: {e}")
def main():
    """Download the results of a directory's batch jobs into one JSONL file.

    Jobs are looked up by display name. Only jobs whose display name ends
    with ``-correction-<directory name>`` are considered; this is the naming
    scheme used by ``submit_batches.py`` (``flash-correction-<dir>`` and
    ``pro-correction-<dir>``). A plain substring test would let ``Interro1``
    pick up the jobs of ``Interro14`` and merge unrelated results.

    Every matching job must be in state ``JOB_STATE_SUCCEEDED``. Their
    output files are concatenated, each terminated by a newline, into
    ``<root_dir>/batched_correction_result.jsonl``.

    Exits through ``sys.exit`` with a message when the directory does not
    exist, ``GEMINI_API_KEY`` is unset, no job matches, a matching job has
    not succeeded, or no matching job has an output file.
    """
    parser = argparse.ArgumentParser(description="Download and combine completed batch jobs for a directory.")
    parser.add_argument("root_dir", type=str, help="Directory containing the original batches")
    args = parser.parse_args()

    target_dir = Path(args.root_dir)
    dir_name = target_dir.name
    output_path = target_dir / "batched_correction_result.jsonl"

    # Fail before any network call rather than after everything was downloaded.
    if not target_dir.is_dir():
        sys.exit(f"Error: '{target_dir}' is not a directory.")

    if "GEMINI_API_KEY" not in os.environ:
        sys.exit("Error: GEMINI_API_KEY environment variable not set.")

    client = genai.Client()

    # 1. Find jobs associated with this directory
    name_suffix = f"-correction-{dir_name}"
    print(f"Fetching jobs matching '{dir_name}'...")
    # NOTE(review): the listing presumably also contains jobs from earlier
    # submissions for the same directory; those are merged too. Confirm whether
    # only the most recent job per display name should be kept.
    matching_jobs = [
        job for job in client.batches.list()
        if (getattr(job, 'display_name', None) or "").endswith(name_suffix)
    ]

    if not matching_jobs:
        sys.exit(f"No batch jobs found with a display name ending in '{name_suffix}'.")

    # 2. Check that all matching jobs are complete
    for job in matching_jobs:
        # The state may be an enum-like object (use its name) or a plain value.
        state = getattr(job.state, 'name', job.state)
        print(f"Found Job: {job.display_name} | State: {state}")
        if state != 'JOB_STATE_SUCCEEDED':
            sys.exit(f"Error: Job '{job.display_name}' has not succeeded yet. Try again later.")

    # 3. Download and concatenate
    print("\nAll jobs succeeded. Downloading results...")
    chunks = []

    for job in matching_jobs:
        dest = getattr(job, 'dest', None)
        file_name = getattr(dest, 'file_name', None) if dest else None
        if not file_name:
            print(f"Warning: Job {job.display_name} succeeded but has no output file.")
            continue

        print(f"Downloading output for {job.display_name}...")
        content = client.files.download(file=file_name)
        if not content:
            continue

        chunks.append(content)
        # Ensure proper line separation between files in JSONL
        if not content.endswith(b'\n'):
            chunks.append(b'\n')

    if not chunks:
        # An empty result file reported as a success would silently yield no corrections.
        sys.exit("Error: none of the matching jobs produced any output; nothing was written.")

    # 4. Save to destination
    with open(output_path, "wb") as f:
        f.write(b"".join(chunks))

    print(f"\nSuccess! All results concatenated and saved to:\n{output_path}")
def main():
    """Upload the per-model JSONL request files of a directory and start batch jobs.

    Two request files are looked for in ``root_dir``:
    ``batch_requests_flash.jsonl`` (model ``gemini-3-flash-preview``) and
    ``batch_requests_pro.jsonl`` (model ``gemini-3.1-pro-preview``). A file
    that is missing or empty is skipped with a notice. Every other file is
    uploaded and a batch job named ``flash-correction-<dir>`` or
    ``pro-correction-<dir>`` is created from it, where ``<dir>`` is the
    last component of ``root_dir``.

    The closing summary states whether any job was actually submitted.
    Exits through ``sys.exit`` when ``GEMINI_API_KEY`` is not set.
    """
    parser = argparse.ArgumentParser(description="Upload JSONL files and create Gemini Batch jobs.")
    parser.add_argument("root_dir", type=str, help="Root directory containing the batch JSONL files")
    args = parser.parse_args()

    root_dir = Path(args.root_dir)

    if "GEMINI_API_KEY" not in os.environ:
        sys.exit("Error: GEMINI_API_KEY environment variable not set.")

    client = genai.Client()

    # Define the batch files and their corresponding models
    batches_to_create = [
        {
            "file_path": root_dir / "batch_requests_flash.jsonl",
            "model_id": "gemini-3-flash-preview",
            "display_name": f"flash-correction-{root_dir.name}",
        },
        {
            "file_path": root_dir / "batch_requests_pro.jsonl",
            "model_id": "gemini-3.1-pro-preview",
            "display_name": f"pro-correction-{root_dir.name}",
        },
    ]

    submitted_jobs = []

    for batch in batches_to_create:
        file_path = batch["file_path"]
        model_id = batch["model_id"]
        display_name = batch["display_name"]

        # Check if the file exists
        if not file_path.exists():
            print(f"Skipping {model_id}: {file_path.name} does not exist.")
            continue

        # Check if the file is empty (e.g., if all tasks went to Flash, Pro might be empty)
        if file_path.stat().st_size == 0:
            print(f"Skipping {model_id}: {file_path.name} is empty.")
            continue

        print(f"Processing {file_path.name} for model {model_id}...")

        # 1. Upload the file to the File API
        print(" Uploading file...")
        uploaded_file = client.files.upload(
            file=str(file_path),
            config=types.UploadFileConfig(
                display_name=f"{display_name}-input",
                mime_type='jsonl',
            ),
        )
        print(f" Uploaded successfully! File ID: {uploaded_file.name}")

        # 2. Create the batch job
        print(" Starting batch job...")
        batch_job = client.batches.create(
            model=model_id,
            src=uploaded_file.name,
            config={
                'display_name': display_name,
            },
        )
        submitted_jobs.append(batch_job.name)
        print(f" Success! Batch Job Name: {batch_job.name}\n")

    print("-" * 50)
    if not submitted_jobs:
        # Every request file was missing or empty: do not claim that jobs were started.
        print("No batch job was submitted: no non-empty request file was found.")
        return

    print("All batch jobs have been initiated.")
    print("Save the Batch Job Names above. You can monitor them with:")
    print(" client.batches.get(name='YOUR_BATCH_JOB_NAME')")