# Copies/grouping.py
#
# Listing metadata: 253 lines, 7.8 KiB, Python

import os
import json
import re
import sys
import shutil
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from PIL import Image, ImageDraw, ImageFont
from pdf2image import convert_from_path, pdfinfo_from_path
# Configuration
DPI = 200 # Rendering resolution in dots per inch; a balance between readability and file size
A4_HEIGHT_INCHES = 11.69
FULL_PAGE_PX = int(A4_HEIGHT_INCHES * DPI) # Height of one A4 page in pixels at DPI
MAX_GROUP_HEIGHT = 1.5 * FULL_PAGE_PX # Height budget for one group: 1.5 A4 pages, in pixels
MAX_GROUP_COUNT = 8 # Maximum number of files placed in one group
SEPARATOR_HEIGHT = 20 # Height in pixels of the black bar drawn between stacked files
LABEL_HEIGHT = 50 # Vertical space in pixels reserved above each file for its "ID: dd" label
MAX_FILE_SIZE_BYTES = 2.5 * 1024 * 1024 # 2.5 MiB; JPEG quality is lowered until the output fits
def get_pdf_height(path):
    """Return the combined pixel height of all pages of a PDF at ``DPI``.

    The height of one page is read from the third space-separated token of
    the ``Page size`` entry reported by ``pdfinfo_from_path`` (a value in
    points, 72 per inch) and multiplied by the page count, so every page is
    assumed to share that height.

    Args:
        path: Location of the PDF file to inspect.

    Returns:
        The total height in pixels. 0 is returned when the file cannot be
        inspected (the error is printed) or when no ``Page size`` entry is
        present.
    """
    try:
        pdf_info = pdfinfo_from_path(path)

        # A missing page count is treated as a single page.
        if "Pages" in pdf_info:
            page_count = int(pdf_info["Pages"])
        else:
            page_count = 1

        if 'Page size' in pdf_info:
            height_pts = float(pdf_info['Page size'].split(' ')[2])
        else:
            height_pts = 0

        # Points -> inches (72 pt per inch) -> pixels (DPI px per inch).
        page_px = int((height_pts / 72.0) * DPI)
        return page_px * page_count
    except Exception as e:
        # Best effort: an unreadable file is reported and counted as empty.
        print(f"Error reading {path}: {e}")
        return 0
def collect_files(root_dir):
    """Collect every PDF stored under a ``Copiedd`` folder below *root_dir*.

    A folder qualifies when its name starts with ``Copie`` followed by two
    digits; those two digits become the ``dd`` tag of each PDF found directly
    inside it. The PDF's file name without extension is its identifier.

    Directories and files are visited in sorted name order. ``os.walk``
    otherwise yields them in arbitrary order, which would make the order of
    the per-identifier lists (and therefore the tie-breaking between files of
    equal height during grouping) differ between runs and machines.

    Args:
        root_dir: Directory tree to scan.

    Returns:
        A ``defaultdict(list)`` mapping identifier to a list of
        ``(dd, path, height)`` triples, where ``height`` comes from
        ``get_pdf_height``.
    """
    data = defaultdict(list)
    # Matches 'Copie' followed by 2 digits at the start of the folder name.
    folder_pattern = re.compile(r'Copie(\d{2})')

    for root, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()

        match = folder_pattern.match(os.path.basename(root))
        if not match:
            continue
        dd = match.group(1)

        for filename in sorted(files):
            if not filename.lower().endswith('.pdf'):
                continue
            identifier = os.path.splitext(filename)[0]
            full_path = os.path.join(root, filename)
            data[identifier].append((dd, full_path, get_pdf_height(full_path)))

    return data
def group_files(file_list, *, max_height=None, max_count=None,
                separator_height=None, label_height=None):
    """Pack files into groups with the First Fit Decreasing heuristic.

    Files are sorted by height, tallest first (ties keep their input order),
    and each one is placed in the first existing group that still has room;
    a new group is opened when none does. A file always gets a group of its
    own if it fits nowhere, even when it alone exceeds the height budget.

    The height of a group is measured the way it is rendered: every file
    carries one label strip and consecutive files are separated by one
    separator bar, i.e. ``sum(heights) + n * label + (n - 1) * separator``.

    Args:
        file_list: Iterable of ``(dd, path, height)`` triples.
        max_height: Height budget per group in pixels; defaults to
            ``MAX_GROUP_HEIGHT``.
        max_count: Maximum number of files per group; defaults to
            ``MAX_GROUP_COUNT``.
        separator_height: Pixels used by the bar between two files; defaults
            to ``SEPARATOR_HEIGHT``.
        label_height: Pixels reserved for each file's label; defaults to
            ``LABEL_HEIGHT``.

    Returns:
        A list of groups, each a list of the original triples.
    """
    if max_height is None:
        max_height = MAX_GROUP_HEIGHT
    if max_count is None:
        max_count = MAX_GROUP_COUNT
    if separator_height is None:
        separator_height = SEPARATOR_HEIGHT
    if label_height is None:
        label_height = LABEL_HEIGHT

    # Large items are the hardest to fit, so they are placed first.
    sorted_files = sorted(file_list, key=lambda entry: entry[2], reverse=True)

    # Each group tracks its items and the summed height of their content
    # only; label and separator overhead is added when checking the budget.
    groups = []
    for item in sorted_files:
        height = item[2]
        for group in groups:
            count = len(group['items'])
            if count >= max_count:
                continue
            new_count = count + 1
            rendered_height = (
                group['content_height'] + height
                + new_count * label_height
                + (new_count - 1) * separator_height
            )
            if rendered_height <= max_height:
                group['items'].append(item)
                group['content_height'] += height
                break
        else:
            # No existing group can take the item.
            groups.append({'items': [item], 'content_height': height})

    return [group['items'] for group in groups]
def stitch_pdf_pages(images_list):
    """Stack page images top to bottom into one image, without separators.

    Args:
        images_list: Sequence of PIL images, one per page.

    Returns:
        ``None`` for an empty (or falsy) input, the sole image itself when
        there is exactly one, otherwise a new white RGB image as wide as the
        widest page and as tall as all pages together, with each page pasted
        at the left edge.
    """
    if not images_list:
        return None
    if len(images_list) == 1:
        return images_list[0]

    canvas_width = max(page.width for page in images_list)
    canvas_height = sum(page.height for page in images_list)
    stitched = Image.new('RGB', (canvas_width, canvas_height), 'white')

    top = 0
    for page in images_list:
        stitched.paste(page, (0, top))
        top += page.height
    return stitched
def create_jpg(identifier, group_index, group, root_dir):
    """Render one group of PDFs into a labelled JPEG plus a JSON index.

    Each PDF is rasterised at ``DPI`` and its pages are stitched vertically.
    The resulting images are stacked on a white canvas; every image is
    preceded by an ``ID: dd`` label strip of ``LABEL_HEIGHT`` pixels and
    consecutive images are separated by a black bar of ``SEPARATOR_HEIGHT``
    pixels. PDFs that fail to convert are reported and left out.

    Two files are written to ``<root_dir>/<identifier>/``:

    * ``Group_<group_index + 1>.json`` - a list of ``[dd, h_min, h_max]``
      entries giving the vertical pixel range of each image on the canvas.
    * ``Group_<group_index + 1>.jpg`` - the canvas, saved at decreasing JPEG
      quality (90, 85, ... 15) until it fits ``MAX_FILE_SIZE_BYTES``. If no
      tried quality fits, the last attempt is kept and a warning is printed.

    Args:
        identifier: Name of the output sub-folder under *root_dir*.
        group_index: Zero-based index of the group; file names use index + 1.
        group: Sequence of ``(dd, path, height)`` triples; ``height`` is not
            used here.
        root_dir: Directory under which the output folder is created.

    Returns:
        None. Nothing is written when no PDF of the group could be rendered.
    """
    # Render PDFs to images.
    images = []
    for dd, path, _ in group:
        try:
            pages = convert_from_path(path, dpi=DPI)
            if pages:
                # Concatenate multi-page PDFs into one single image object.
                combined_img = stitch_pdf_pages(pages)
                if combined_img:
                    images.append((dd, combined_img))
        except Exception as e:
            print(f"Failed to convert {path}: {e}")

    if not images:
        return

    # Canvas size: all images, one separator between neighbours, and one
    # label strip of LABEL_HEIGHT pixels per image.
    total_width = max(img.width for _, img in images)
    total_height = sum(img.height for _, img in images)
    total_height += (len(images) - 1) * SEPARATOR_HEIGHT
    total_height += len(images) * LABEL_HEIGHT

    canvas = Image.new('RGB', (total_width, total_height), 'white')
    draw = ImageDraw.Draw(canvas)

    # Try loading a scalable font, fall back to PIL's built-in default.
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 40)
    except IOError:
        print("font not found")
        font = ImageFont.load_default()

    metadata = []  # (dd, h_min, h_max) for each image on the canvas
    y_offset = 0
    for i, (dd, img) in enumerate(images):
        # Separator bar before every image except the first.
        if i > 0:
            draw.rectangle(
                [0, y_offset, total_width, y_offset + SEPARATOR_HEIGHT],
                fill='black',
            )
            y_offset += SEPARATOR_HEIGHT

        # Label strip.
        draw.text((10, y_offset + 5), f"ID: {dd}", fill='black', font=font)
        y_offset += LABEL_HEIGHT

        # Record where the image lands, then paste it at the left edge.
        metadata.append((dd, y_offset, y_offset + img.height))
        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)

    # Save JSON metadata.
    json_path = os.path.join(target_folder, f"Group_{group_index+1}.json")
    with open(json_path, 'w') as f:
        json.dump(metadata, f)

    # Save the JPEG, lowering quality until the size limit is met.
    output_path = os.path.join(target_folder, f"Group_{group_index+1}.jpg")
    quality = 90
    while quality > 10:
        canvas.save(output_path, "JPEG", quality=quality, optimize=True)
        if os.path.getsize(output_path) <= MAX_FILE_SIZE_BYTES:
            if quality < 90:
                print("quality : ", quality)
            break
        quality -= 5
    else:
        # Every tried quality produced a file above the limit; the last
        # attempt stays on disk, so make that visible instead of silent.
        print(f"Warning: {output_path} still exceeds the size limit at the lowest quality tried")

    print(f"Saved {output_path} with {len(group)} ({os.path.getsize(output_path)/1024/1024:.2f} MB)")
def natural_key(text):
    """Return a sort key that orders embedded numbers by value.

    The text is split into alternating non-digit and digit runs; digit runs
    become integers and the other runs are lower-cased, so ``"a2"`` sorts
    before ``"a10"`` and case is ignored.

    Args:
        text: Value to build the key for; it is converted with ``str``.

    Returns:
        A list alternating ``str`` and ``int`` items, always starting with a
        (possibly empty) string.
    """
    pieces = re.split(r'(\d+)', str(text))
    # With a single capture group, re.split puts the captured digit runs at
    # the odd positions. Relying on the position instead of str.isdigit()
    # avoids calling int() on characters such as '²', which isdigit()
    # accepts but int() rejects.
    return [
        int(piece) if index % 2 else piece.lower()
        for index, piece in enumerate(pieces)
    ]
def process_identifier(identifier, files_info, root_dir):
    """Regenerate all group images for one identifier.

    The output folder ``<root_dir>/<identifier>`` is created if needed and
    cleared of ``Group_<n>.jpg`` / ``Group_<n>.json`` files left by an
    earlier run, then the files are grouped and one JPEG is produced per
    group.

    Only those generated files are removed rather than the whole folder: the
    folder name is derived from a PDF file name, so deleting the directory
    tree could destroy unrelated data (for example a source folder, if a PDF
    happens to be named like one).

    Args:
        identifier: Identifier whose output is rebuilt; also the name of the
            output folder.
        files_info: List of ``(dd, path, height)`` triples for the identifier.
        root_dir: Directory under which the output folder lives.
    """
    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)

    # Remove stale outputs so a run that yields fewer groups than the
    # previous one does not leave outdated Group files behind.
    stale_output = re.compile(r'Group_\d+\.(?:jpg|json)')
    for name in os.listdir(target_folder):
        stale_path = os.path.join(target_folder, name)
        if stale_output.fullmatch(name) and os.path.isfile(stale_path):
            os.remove(stale_path)

    # files_info is a list of (dd, path, height).
    file_groups = group_files(files_info)
    for idx, group in enumerate(file_groups):
        create_jpg(identifier, idx, group, root_dir)
def main():
    """Command-line entry point: group and render every identifier's PDFs.

    Expects the directory to scan as the first command-line argument and
    exits with status 1 when it is missing. Identifiers are processed in
    natural sort order on a pool of 4 worker threads. A failure while
    processing one identifier is reported and does not stop the others.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <Path_to_Dir>")
        sys.exit(1)

    root_dir = sys.argv[1]

    print("Scanning files...")
    data = collect_files(root_dir)
    print(f"Found {len(data)} identifiers. Processing...")

    # Sort identifiers naturally.
    sorted_identifiers = sorted(data.keys(), key=natural_key)

    # Process using 4 threads. The futures are kept so that an exception
    # raised in a worker is reported instead of being silently discarded.
    failures = 0
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            (identifier,
             executor.submit(process_identifier, identifier, data[identifier], root_dir))
            for identifier in sorted_identifiers
        ]
        for identifier, future in futures:
            try:
                future.result()
            except Exception as e:
                failures += 1
                print(f"Failed to process {identifier}: {e}")

    if failures:
        print(f"{failures} identifier(s) could not be processed.")
    print("Done.")


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()