master
Sébastien Miquel 2026-04-23 12:41:50 +02:00
parent bc47f81556
commit dd0d757fc9
4 changed files with 96 additions and 16 deletions

View File

@ -12,8 +12,7 @@ import annotating_with_checks
from utils import natural_key
# Roughly 10 A4 pages at 100 DPI
MAX_HEIGHT_PX = 20000 # Can be increased by 10%.
MAX_HEIGHT_PX = 25000 # Can be increased by 10%.
def render_item(item):
student_id, label, content = item
@ -134,15 +133,32 @@ def main():
shutil.rmtree(bgnot_dir)
os.makedirs(bgnot_dir, exist_ok=True)
used_prefixes = set()
previous_prefix = None
for line in lines:
labels = [l.strip() for l in line.split(',') if l.strip()]
safe_labels = [l.replace(":", "").strip() for l in line.split(',') if l.strip()]
if not labels:
continue
prefix = os.path.commonprefix(safe_labels).strip()
if not prefix:
prefix = "Group"
# Derive the output-file prefix for this group of labels from what the
# labels have in common; fall back to a generic name when they share nothing.
base_prefix = os.path.commonprefix(safe_labels).strip()
if not base_prefix:
    base_prefix = "Group"

unique_prefix = base_prefix
# Drop a trailing "i" from the shared prefix.
# NOTE(review): presumably the start of a roman-numeral sub-question
# marker ("i", "ii", ...) that leaked into the common prefix -- confirm.
if unique_prefix.endswith("i"):
    unique_prefix = unique_prefix[:-1]

# Disambiguate against prefixes already handed out to earlier groups.
counter = 2
while unique_prefix in used_prefixes:
    unique_prefix = f"{base_prefix}-{counter}"
    counter += 1

if counter == 2 and previous_prefix and previous_prefix in unique_prefix:
    # No clash was found, but this prefix contains the remembered one:
    # name the group after the remembered prefix with a numeric suffix.
    unique_prefix = f"{previous_prefix}-{counter}"
    # That derived name may already be taken (by an earlier clash or an
    # earlier group renamed the same way), so keep bumping the suffix
    # until the name is free.
    while unique_prefix in used_prefixes:
        counter += 1
        unique_prefix = f"{previous_prefix}-{counter}"
elif counter == 2:
    # Fresh, unrelated prefix: remember it for the groups that follow.
    # (The original assigned to the misspelled name `previous_prefx`, so
    # `previous_prefix` never changed and the branch above never ran.)
    previous_prefix = unique_prefix

used_prefixes.add(unique_prefix)
items_to_render = []
for sid, lbls in results.items():
@ -201,7 +217,7 @@ def main():
batches = batches2
for i, batch in enumerate(batches, 1):
save_batch(batch, prefix, i, root_dir, args.overwrite)
save_batch(batch, unique_prefix, i, root_dir, args.overwrite)
if __name__ == "__main__":
main()

View File

@ -20,7 +20,7 @@ parser.add_argument("--refaire", action="store_true",
help="Redo specific copies/labels defined in refaire.json")
parser.add_argument("--batch", action="store_true",
help="Generate a JSONL file of requests to send to the Gemini Batch API")
parser.add_argument("--deal-with-batched", type=str, metavar="FILE",
parser.add_argument("--deal-with-batched", action="store_true",
help="Process a JSONL file containing completed batch results")
args, _ = parser.parse_known_args()
@ -236,6 +236,44 @@ class EvaluationEntry(BaseModel):
id: str = Field(description="Entry identifier")
result: ResultData = Field(description="Result details")
# The nested model definitions above do not work with the batch API, so the
# same response schema is written out here by hand, one level at a time.

# Optional bounding box attached to a feedback item.
_BOX_2D_SCHEMA = {
    "type": "ARRAY",
    "items": {"type": "INTEGER"},
    "nullable": True,
    "description": "box coordinates or null",
}

# One feedback item: mandatory text plus the optional box.
_FEEDBACK_ITEM_SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "text": {"type": "STRING", "description": "Feedback content"},
        "box_2d": _BOX_2D_SCHEMA,
    },
    "required": ["text"],
}

# The "result" payload of an entry.
_RESULT_SCHEMA = {
    "type": "OBJECT",
    "properties": {
        "score": {"type": "NUMBER", "description": "The numeric score"},
        "confidence": {"type": "NUMBER", "description": "Confidence level"},
        "error": {"type": "STRING", "description": "Indicates if an error occurred"},
        "feedback": {
            "type": "ARRAY",
            "description": "List of feedback items",
            "items": _FEEDBACK_ITEM_SCHEMA,
        },
    },
    "required": ["score", "confidence", "feedback", "error"],
}

# Top level: an array of {id, result} entries.
UNROLLED_SCHEMA = {
    "type": "ARRAY",
    "items": {
        "type": "OBJECT",
        "properties": {
            "id": {"type": "STRING", "description": "Entry identifier"},
            "result": _RESULT_SCHEMA,
        },
        "required": ["id", "result"],
    },
}
# The root model for parsing is: List[EvaluationEntry]
def generate_request(file, full_label):
"""Generates request for Gemini."""
@ -780,7 +818,8 @@ if __name__ == "__main__":
"topP": 0.95,
"maxOutputTokens": 65535,
"responseMimeType": "application/json",
"responseSchema": TypeAdapter(List[EvaluationEntry]).json_schema()
"responseSchema": UNROLLED_SCHEMA
# TypeAdapter(List[EvaluationEntry]).json_schema()
}
}
}
@ -800,7 +839,7 @@ if __name__ == "__main__":
batched_responses = {}
if args.deal_with_batched:
batch_results_path = Path(args.deal_with_batched)
batch_results_path = Path(INPUT_DIR) / "batched_correction_result.jsonl"
if batch_results_path.exists():
print(f"Loading batch results from {batch_results_path}...")
with open(batch_results_path, "r", encoding="utf-8") as f:

View File

@ -7395,6 +7395,7 @@ ennuis
ennuyer
ennuyeux
énoncer
énoncé
énonciation
énorme
énormément

View File

@ -39,23 +39,47 @@ def fast_fix(text):
broken_word = match.group(0)
# Return the fixed word from our map, or leave it if not found
# (Handles case-insensitivity by falling back to lowercase map)
return lookup_map.get(broken_word.lower(), broken_word)
fixed = lookup_map.get(broken_word.lower())
# if not fixed:
# print(f"No match found for: {repr(broken_word)}")
return fixed or broken_word
return re.sub(r'[a-zA-Z\x00]+', replacer, text)
return text
# return text
# Both constants resolve to the same path: correction.json under INPUT_DIR.
# NOTE(review): presumably the cleaned data is written back over the input
# file in place -- confirm against the code that uses OUTPUT_FILE.
INPUT_FILE = Path(INPUT_DIR) / "correction.json"
OUTPUT_FILE = Path(INPUT_DIR) / "correction.json"
def fix_hex_corruption_safe(text):
    """Turn "NUL + two hex digits" sequences back into single characters.

    A sequence is rewritten only when its first hex digit is e, E, f or F,
    so the decoded code point always lies in U+00E0..U+00FF (the accented
    lower-case letters and a few symbols of Latin-1). Any other NUL in
    *text* is left untouched.

    Returns the repaired string.
    """
    def _decode(found):
        # group(1) holds the two hex digits that follow the NUL byte.
        return chr(int(found.group(1), 16))

    nul_hex = re.compile(r'\x00([eEfF][0-9a-fA-F])')
    return nul_hex.sub(_decode, text)
def some_other_replacements(s):
    """Re-escape two sequences whose leading backslash-n became a newline.

    A newline followed by "eq" is replaced by a literal backslash and "neq";
    a newline followed by "ot" is replaced by a literal backslash and "not".
    NOTE(review): presumably these are LaTeX commands damaged by an
    unescaping step upstream -- confirm.

    Returns the string with both substitutions applied, in that order.
    """
    substitutions = (
        ("\neq", "\\neq"),
        ("\not", "\\not"),
    )
    for damaged, repaired in substitutions:
        s = s.replace(damaged, repaired)
    return s
def clean_string(s: str) -> str:
    """Repair the encoding damage found in a single string.

    The steps run in this order:

    1. decode "NUL + hex digits" sequences via fix_hex_corruption_safe;
    2. map the control characters 0x19 and 0x18 to NUL, then collapse a
       doubled NUL into one;
    3. replace an isolated run of one or two NULs surrounded by spaces
       with the word "à";
    4. if any NUL is still present, let fast_fix repair the affected words
       and strip whatever NULs remain;
    5. apply some_other_replacements to the result.

    ftfy.fix_text is no longer part of the pipeline. The version of this
    function shown before this change still called it, ran fast_fix
    unconditionally and returned before steps 3 and 5 could execute.

    Args:
        s: the text to clean.

    Returns:
        The cleaned text.
    """
    s = fix_hex_corruption_safe(s)

    # Funnel every placeholder for a lost character into a single NUL so
    # the later steps only have one marker to look for.
    for stray in ('\x19', '\x18'):
        s = s.replace(stray, '\x00')
    s = s.replace('\x00\x00', '\x00')

    # A lone placeholder between two spaces is taken to be the word "à".
    s = re.sub(r' \x00{1,2} ', ' à ', s)

    # The dictionary-based repair is only needed when damage remains.
    if '\x00' in s:
        s = fast_fix(s)
        s = s.replace('\x00', '')

    return some_other_replacements(s)
def clean_obj(obj):