# Copies/grouping.py

import os
import json
import re
import sys
import shutil
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from PIL import Image, ImageDraw, ImageFont
from pdf2image import convert_from_path, pdfinfo_from_path

# Configuration
DPI = 200  # Rendering resolution: a compromise between legibility and output size
A4_HEIGHT_INCHES = 11.69
FULL_PAGE_PX = int(DPI * A4_HEIGHT_INCHES)  # Height of one A4 page in pixels at DPI
MAX_GROUP_HEIGHT = FULL_PAGE_PX * 1.5  # Height budget of one group: one and a half pages
MAX_GROUP_COUNT = 8  # Upper bound on the number of documents in one group
SEPARATOR_HEIGHT = 20  # Height in pixels of the bar drawn between two documents
LABEL_HEIGHT = 50  # Vertical space in pixels reserved for each "ID" label
MAX_FILE_SIZE_BYTES = 2.5 * 1024 ** 2  # 2.5 MiB

def get_pdf_height(path):
    """Estimate the total height of a PDF in pixels at the configured DPI.

    The single 'Page size' value reported by pdfinfo is converted from
    points to pixels and multiplied by the page count, so every page is
    treated as having that same height.

    Returns 0 when the metadata cannot be read (the error is printed) or
    when no 'Page size' entry is present.
    """
    try:
        info = pdfinfo_from_path(path)

        # A missing page count is treated as a single page.
        page_count = int(info.get("Pages", 1))

        if 'Page size' in info:
            # The height is taken from the third space-separated token;
            # presumably the value looks like "<width> x <height> pts".
            height_pts = float(info['Page size'].split(' ')[2])
        else:
            height_pts = 0

        # Points are 1/72 inch; scale to pixels and truncate per page.
        page_px = int((height_pts / 72.0) * DPI)
        return page_px * page_count
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return 0

def collect_files(root_dir):
    """
    Walk root_dir and index every PDF found in a 'Copie<dd>' folder.

    A folder qualifies when its name starts with 'Copie' followed by two
    digits; those two digits become dd. The identifier is the PDF file name
    without its extension.

    Returns a defaultdict: {identifier: [(dd, path, height), ...]}
    """
    copie_folder = re.compile(r'Copie(\d{2})')
    found = defaultdict(list)

    for current_dir, _subdirs, filenames in os.walk(root_dir):
        hit = copie_folder.match(os.path.basename(current_dir))
        if hit is None:
            continue

        dd = hit.group(1)
        for filename in filenames:
            # Extension check is case-insensitive
            if not filename.lower().endswith('.pdf'):
                continue

            stem, _extension = os.path.splitext(filename)
            pdf_path = os.path.join(current_dir, filename)
            found[stem].append((dd, pdf_path, get_pdf_height(pdf_path)))

    return found

def group_files(file_list):
    """
    Pack files into groups with a First Fit Decreasing strategy.

    file_list is a list of (dd, path, height) triples. A group accepts at
    most MAX_GROUP_COUNT items and its rendered height must stay within
    MAX_GROUP_HEIGHT. The rendered height mirrors the layout produced by
    create_jpg: every document is preceded by a LABEL_HEIGHT label, and
    consecutive documents are separated by a SEPARATOR_HEIGHT bar.

    An item that fits nowhere (including one that alone exceeds the budget)
    always gets a group of its own.

    Returns a list of groups, each a list of the original triples.
    """
    # Tallest first: large items are the hardest to place.
    # (Remove this sort if input order must be strictly preserved.)
    by_height = sorted(file_list, key=lambda entry: entry[2], reverse=True)

    # Each group is a dict: {'items': [...], 'current_height': rendered height}
    groups = []

    for item in by_height:
        height = item[2]
        # Height added when the item joins a non-empty group:
        # separator bar + its own label + the document itself.
        appended_height = SEPARATOR_HEIGHT + LABEL_HEIGHT + height

        for group in groups:
            if len(group['items']) >= MAX_GROUP_COUNT:
                continue
            if group['current_height'] + appended_height <= MAX_GROUP_HEIGHT:
                group['items'].append(item)
                group['current_height'] += appended_height
                break
        else:
            # No existing group can take it; the first item of a group has a
            # label above it but no separator.
            groups.append({
                'items': [item],
                'current_height': LABEL_HEIGHT + height,
            })

    # Strip the bookkeeping and return plain lists
    return [group['items'] for group in groups]

def stitch_pdf_pages(images_list):
    """Stack PIL images top to bottom, left-aligned, with no gap between them.

    Returns None for an empty list and the sole image itself (not a copy)
    for a one-element list. Otherwise returns a new white RGB image as wide
    as the widest input and as tall as all inputs combined.
    """
    if not images_list:
        return None
    if len(images_list) == 1:
        return images_list[0]

    widths, heights = zip(*((page.width, page.height) for page in images_list))
    sheet = Image.new('RGB', (max(widths), sum(heights)), 'white')

    top = 0
    for page in images_list:
        sheet.paste(page, (0, top))
        top += page.height

    return sheet

def create_jpg(identifier, group_index, group, root_dir):
    """Render one group of PDFs into a labelled JPEG with a JSON sidecar.

    group is a list of (dd, path, height) triples. Each PDF is rasterised
    at DPI, its pages stacked into one image, and the results are laid out
    top to bottom on a white canvas: an "ID: <dd>" label above every
    document and a black bar between consecutive documents.

    Writes, inside <root_dir>/<identifier>/:
      * Group_<n>.json - a list of (dd, h_min, h_max, width ratio,
        identifier) entries giving each document's vertical span on the
        canvas and its width relative to the canvas width;
      * Group_<n>.jpg  - the canvas, re-saved at decreasing JPEG quality
        (90 down to 15 in steps of 5) until it is no larger than
        MAX_FILE_SIZE_BYTES. If no quality fits, the last attempt is kept.

    PDFs that fail to convert are reported and skipped; if none converts,
    nothing is written and the function returns early.
    """
    # Rasterise every PDF of the group
    rendered = []
    for dd, path, _height in group:
        try:
            pages = convert_from_path(path, dpi=DPI)
            if pages:
                # Collapse a multi-page PDF into one tall image
                stitched = stitch_pdf_pages(pages)
                if stitched:
                    rendered.append((dd, stitched))
        except Exception as e:
            print(f"Failed to convert {path}: {e}")

    if not rendered:
        return

    # Canvas is as wide as the widest document and tall enough for all
    # documents, one label per document and one separator per gap.
    total_width = max(img.width for _, img in rendered)
    separators_px = (len(rendered) - 1) * SEPARATOR_HEIGHT
    labels_px = len(rendered) * LABEL_HEIGHT
    total_height = sum(img.height for _, img in rendered) + separators_px + labels_px

    canvas = Image.new('RGB', (total_width, total_height), 'white')
    draw = ImageDraw.Draw(canvas)

    # Preferred label font, with Pillow's built-in font as a fallback
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 40)
    except IOError:
        print("font not found")
        font = ImageFont.load_default()

    metadata = []  # One (dd, h_min, h_max, width ratio, identifier) per document
    cursor = 0  # Current vertical position on the canvas

    for position, (dd, img) in enumerate(rendered):
        # Separator above every document except the first
        if position:
            draw.rectangle([0, cursor, total_width, cursor + SEPARATOR_HEIGHT], fill='black')
            cursor += SEPARATOR_HEIGHT

        # Label, then skip the space reserved for it
        draw.text((10, cursor + 5), f"ID: {dd}", fill='black', font=font)
        cursor += LABEL_HEIGHT

        # Vertical span the document occupies on the canvas
        metadata.append((dd, cursor, cursor + img.height, img.width/total_width, identifier))

        # Document itself, flush with the left edge
        canvas.paste(img, (0, cursor))
        cursor += img.height

    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)

    # JSON sidecar
    with open(os.path.join(target_folder, f"Group_{group_index+1}.json"), 'w') as f:
        json.dump(metadata, f)

    # JPEG, lowering the quality until the size limit is met
    output_path = os.path.join(target_folder, f"Group_{group_index+1}.jpg")

    for quality in range(90, 10, -5):
        canvas.save(output_path, "JPEG", quality=quality, optimize=True)
        if os.path.getsize(output_path) > MAX_FILE_SIZE_BYTES:
            continue
        if quality < 90:
            print("quality : ", quality)
        break

    print(f"Saved {output_path} with {len(group)} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")

def natural_key(text):
    """Return a sort key that orders embedded numbers numerically.

    The text is split on runs of digits; digit runs become ints and the
    text in between is lower-cased, so "a2" sorts before "a10".

    re.split with a capturing group always alternates text, digits, text,
    ..., so the chunk position (not str.isdigit) decides what is a number.
    This keeps the key's str/int alternation intact and avoids int() being
    called on characters such as "²" that isdigit() accepts but \\d does
    not split on.
    """
    chunks = re.split(r'(\d+)', str(text))
    return [
        int(chunk) if index % 2 else chunk.lower()
        for index, chunk in enumerate(chunks)
    ]


def process_identifier(identifier, files_info, root_dir):
    """Regenerate every group image for one identifier.

    files_info is the list of (dd, path, height) triples collected for the
    identifier. The output folder <root_dir>/<identifier> is deleted if it
    exists and recreated empty, so results of a previous run never linger.
    The files are then split into groups and each group is rendered.
    """
    # Start from an empty output folder
    output_dir = os.path.join(root_dir, identifier)
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # One image per group, numbered in grouping order
    for position, members in enumerate(group_files(files_info)):
        create_jpg(identifier, position, members, root_dir)

def main():
    """Command-line entry point: group and render every identifier under a directory.

    Expects the root directory as the first command-line argument; prints a
    usage message and exits with status 1 when it is missing. Identifiers
    are processed on a pool of 4 worker threads. An identifier whose
    processing raises is reported instead of being silently dropped, and
    the remaining identifiers are still processed.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <Path_to_Dir>")
        sys.exit(1)

    root_dir = sys.argv[1]

    print("Scanning files...")
    data = collect_files(root_dir)

    print(f"Found {len(data)} identifiers. Processing...")

    # Sort identifiers naturally
    sorted_identifiers = sorted(data, key=natural_key)

    # Process using 4 threads. The futures are kept so that an exception
    # raised inside a worker surfaces here; a future that is never
    # inspected swallows its exception.
    failed = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        submitted = [
            (identifier, executor.submit(process_identifier, identifier, data[identifier], root_dir))
            for identifier in sorted_identifiers
        ]
        for identifier, future in submitted:
            try:
                future.result()
            except Exception as e:
                failed.append(identifier)
                print(f"Failed to process {identifier}: {e}")

    if failed:
        print(f"{len(failed)} identifier(s) failed: {', '.join(failed)}")

    print("Done.")

if __name__ == "__main__":
    main()