# Listing metadata: 254 lines, 7.9 KiB, Python
import os
|
|
import json
|
|
import re
|
|
import sys
|
|
import shutil
|
|
from collections import defaultdict
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
from pdf2image import convert_from_path, pdfinfo_from_path
|
|
|
|
# Configuration
DPI = 200  # Render resolution; a balance between readability and file size
A4_HEIGHT_INCHES = 11.69
FULL_PAGE_PX = int(A4_HEIGHT_INCHES * DPI)  # Height of one A4 page in pixels at DPI
MAX_GROUP_HEIGHT = 1.5 * FULL_PAGE_PX  # Height budget of one group: 1.5 A4 pages
MAX_GROUP_COUNT = 8  # Maximum number of PDFs placed in one group
SEPARATOR_HEIGHT = 20  # Height in pixels of the bar drawn between two items
LABEL_HEIGHT = 50  # Vertical space in pixels reserved for each item's text label
MAX_FILE_SIZE_BYTES = 2.5 * 1024 * 1024  # 2.5 MiB (not 2 MB)
|
|
|
|
def get_pdf_height(path):
    """Estimate the rendered height, in pixels, of every page of a PDF.

    The page count and page size are read from ``pdfinfo_from_path``; the
    height of one page (in points) is converted to pixels at ``DPI`` and
    multiplied by the page count.

    Args:
        path: Location of the PDF file.

    Returns:
        The total height in pixels, or 0 when anything goes wrong (the error
        is printed, never raised). 0 is also returned when the metadata has
        no 'Page size' entry.
    """
    try:
        pdf_info = pdfinfo_from_path(path)

        # A missing page count is treated as a single page.
        page_count = int(pdf_info.get("Pages", 1))

        if 'Page size' in pdf_info:
            # The third space-separated token is taken as the height in
            # points -- presumably the value looks like "595 x 842 pts";
            # verify against pdfinfo output.
            height_pts = float(pdf_info['Page size'].split(' ')[2])
        else:
            height_pts = 0

        # 1 pt = 1/72 inch, so points / 72 * DPI gives pixels.
        page_px = int((height_pts / 72.0) * DPI)
        return page_px * page_count
    except Exception as e:
        print(f"Error reading {path}: {e}")
        return 0
|
|
|
|
def collect_files(root_dir):
    """Index every PDF stored in a ``Copie<dd>`` folder below *root_dir*.

    The tree is walked recursively. A directory qualifies when its name
    starts with 'Copie' followed by two digits (the pattern is anchored at
    the start of the name only). Each ``*.pdf`` inside it (extension compared
    case-insensitively) is recorded under its file name without extension.

    Args:
        root_dir: Directory to scan.

    Returns:
        A ``defaultdict(list)`` mapping ``identifier`` to a list of
        ``(dd, path, height)`` tuples, where ``dd`` is the two-digit folder
        suffix and ``height`` comes from ``get_pdf_height``.
    """
    grouped = defaultdict(list)
    copy_dir_re = re.compile(r'Copie(\d{2})')

    for current_dir, _subdirs, filenames in os.walk(root_dir):
        hit = copy_dir_re.match(os.path.basename(current_dir))
        if not hit:
            continue

        dd = hit.group(1)
        for filename in filenames:
            if not filename.lower().endswith('.pdf'):
                continue

            identifier = os.path.splitext(filename)[0]
            pdf_path = os.path.join(current_dir, filename)
            grouped[identifier].append(
                (dd, pdf_path, get_pdf_height(pdf_path))
            )

    return grouped
|
|
|
|
def group_files(file_list, *, max_height=None, max_count=None,
                separator_height=None, label_height=None):
    """Pack files into as few groups as possible (First Fit Decreasing).

    Items are sorted by height, tallest first (ties keep their input order),
    and each one is placed in the first group that can still take it. A new
    group is opened when none fits.

    The height tracked for a group mirrors the canvas laid out by
    ``create_jpg``: every item costs its own height plus one label strip, and
    every item after the first additionally costs one separator bar.

    Args:
        file_list: Iterable of ``(dd, path, height)`` tuples.
        max_height: Height budget of one group in pixels. Defaults to
            ``MAX_GROUP_HEIGHT``.
        max_count: Maximum number of items per group. Defaults to
            ``MAX_GROUP_COUNT``.
        separator_height: Pixels used by the bar between two items. Defaults
            to ``SEPARATOR_HEIGHT``.
        label_height: Pixels reserved for each item's label. Defaults to
            ``LABEL_HEIGHT``.

    Returns:
        A list of groups, each a list of the original item tuples. An item
        that exceeds the budget on its own still gets a group to itself.
    """
    if max_height is None:
        max_height = MAX_GROUP_HEIGHT
    if max_count is None:
        max_count = MAX_GROUP_COUNT
    if separator_height is None:
        separator_height = SEPARATOR_HEIGHT
    if label_height is None:
        label_height = LABEL_HEIGHT

    # Large items are the hardest to place, so handle them first.
    ordered = sorted(file_list, key=lambda entry: entry[2], reverse=True)

    # Each group is a dict: {'items': [...], 'current_height': <pixels>}
    groups = []

    for item in ordered:
        # The item itself plus the label strip drawn above it.
        needed = item[2] + label_height

        for group in groups:
            if len(group['items']) >= max_count:
                continue

            # Joining an existing group also costs one separator bar.
            candidate = group['current_height'] + separator_height + needed
            if candidate <= max_height:
                group['items'].append(item)
                group['current_height'] = candidate
                break
        else:
            # Nothing could take the item: open a new group for it.
            groups.append({'items': [item], 'current_height': needed})

    # Return list of lists (strip the bookkeeping).
    return [group['items'] for group in groups]
|
|
|
|
def stitch_pdf_pages(images_list):
    """Stack page images on top of each other, without any separator.

    Args:
        images_list: Sequence of PIL images, one per page.

    Returns:
        ``None`` for an empty (or falsy) input, the sole image itself when
        there is exactly one, otherwise a new white RGB image as wide as the
        widest page and as tall as all pages together, with every page pasted
        at the left edge.
    """
    if not images_list:
        return None
    if len(images_list) == 1:
        return images_list[0]

    widths = [page.width for page in images_list]
    heights = [page.height for page in images_list]
    sheet = Image.new('RGB', (max(widths), sum(heights)), 'white')

    top = 0
    for page, page_height in zip(images_list, heights):
        sheet.paste(page, (0, top))
        top += page_height

    return sheet
|
|
|
|
def _render_group_images(group):
    """Render every PDF of *group* into one tall image: [(dd, image), ...]."""
    rendered = []
    for dd, path, _ in group:
        try:
            pages = convert_from_path(path, dpi=DPI)
            if pages:
                # Multi-page PDFs become one single image object.
                stitched = stitch_pdf_pages(pages)
                if stitched:
                    rendered.append((dd, stitched))
        except Exception as e:
            # Best effort: a broken PDF is reported and left out of the group.
            print(f"Failed to convert {path}: {e}")
    return rendered


def _load_label_font():
    """Return the label font, falling back to PIL's default when missing."""
    try:
        return ImageFont.truetype("DejaVuSans.ttf", 40)
    except IOError:
        print("font not found")
        return ImageFont.load_default()


def _compose_canvas(identifier, images):
    """Draw labels, separators and images on one canvas; return it with metadata."""
    total_width = max(img.width for _, img in images)
    # Image heights + one separator between neighbours + one label per image.
    total_height = (
        sum(img.height for _, img in images)
        + (len(images) - 1) * SEPARATOR_HEIGHT
        + len(images) * LABEL_HEIGHT
    )

    canvas = Image.new('RGB', (total_width, total_height), 'white')
    draw = ImageDraw.Draw(canvas)
    font = _load_label_font()

    metadata = []  # (dd, h_min, h_max, width ratio, identifier) per image
    y_offset = 0
    for i, (dd, img) in enumerate(images):
        # Black separator bar before every image except the first.
        if i > 0:
            draw.rectangle(
                [0, y_offset, total_width, y_offset + SEPARATOR_HEIGHT],
                fill='black',
            )
            y_offset += SEPARATOR_HEIGHT

        # Label strip carrying the two-digit folder id.
        draw.text((10, y_offset + 5), f"ID: {dd}", fill='black', font=font)
        y_offset += LABEL_HEIGHT

        # Vertical extent of the image on the canvas, and the fraction of the
        # canvas width it covers (images are pasted at the left edge).
        metadata.append(
            (dd, y_offset, y_offset + img.height,
             img.width / total_width, identifier)
        )

        canvas.paste(img, (0, y_offset))
        y_offset += img.height

    return canvas, metadata


def _save_jpeg_within_limit(canvas, output_path):
    """Save as JPEG, lowering quality from 90 in steps of 5; return final size."""
    quality = 90
    while quality > 10:
        canvas.save(output_path, "JPEG", quality=quality, optimize=True)
        size = os.path.getsize(output_path)
        if size <= MAX_FILE_SIZE_BYTES:
            if quality < 90:
                print("quality : ", quality)
            return size
        quality -= 5

    # Every attempted quality was too large; the last attempt stays on disk.
    print(
        f"Warning: {output_path} still exceeds the size limit "
        f"({size/1024/1024:.2f} MB) at the lowest quality tried"
    )
    return size


def create_jpg(identifier, group_index, group, root_dir):
    """Render one group of PDFs into a labelled JPEG plus a JSON sidecar.

    Every PDF of the group is rasterised at ``DPI`` and stacked vertically on
    a white canvas. Each image is preceded by an ``ID: <dd>`` label and, from
    the second one on, by a black separator bar. The outputs are written to
    ``<root_dir>/<identifier>/Group_<group_index + 1>.json`` and ``.jpg``.
    The JPEG quality is reduced step by step until the file fits
    ``MAX_FILE_SIZE_BYTES``; a warning is printed when no attempted quality
    fits.

    Args:
        identifier: Name of the output sub-folder; also stored in the metadata.
        group_index: Zero-based index of the group (file names are one-based).
        group: List of ``(dd, path, height)`` tuples.
        root_dir: Directory under which the output folder is created.

    Returns:
        None. Nothing is written when no PDF of the group could be rendered.
    """
    images = _render_group_images(group)
    if not images:
        return

    canvas, metadata = _compose_canvas(identifier, images)

    target_folder = os.path.join(root_dir, identifier)
    os.makedirs(target_folder, exist_ok=True)

    # Save JSON metadata
    json_path = os.path.join(target_folder, f"Group_{group_index+1}.json")
    with open(json_path, 'w') as f:
        json.dump(metadata, f)

    # Save with size constraints
    output_path = os.path.join(target_folder, f"Group_{group_index+1}.jpg")
    size = _save_jpeg_within_limit(canvas, output_path)

    print(f"Saved {output_path} with {len(group)} ({size/1024/1024:.2f} MB)")
|
|
|
|
def natural_key(text):
    """Return a sort key that orders embedded numbers by value.

    The text is split on runs of decimal digits. Digit runs become ``int``
    and everything else is lower-cased, so ``"x9"`` sorts before ``"x10"``.

    ``str.isdecimal`` is used because it agrees with the ``\\d`` of the split
    pattern. ``str.isdigit`` also accepts characters such as ``'²'`` that
    ``\\d`` never isolates and ``int()`` cannot parse.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        A list alternating lower-cased text chunks and integers.
    """
    return [
        int(chunk) if chunk.isdecimal() else chunk.lower()
        for chunk in re.split(r'(\d+)', str(text))
    ]
|
|
|
|
|
|
def process_identifier(identifier, files_info, root_dir):
    """Rebuild the whole output folder of one identifier.

    ``<root_dir>/<identifier>`` is deleted with all its content when it
    already exists, then recreated empty. The files are packed into groups by
    ``group_files`` and each group is rendered with ``create_jpg``.

    Args:
        identifier: Name of the output sub-folder.
        files_info: List of ``(dd, path, height)`` tuples for this identifier.
        root_dir: Directory holding the output folder.
    """
    out_dir = os.path.join(root_dir, identifier)

    # Start from a clean folder so no stale output survives.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    for position, members in enumerate(group_files(files_info)):
        create_jpg(identifier, position, members, root_dir)
|
|
|
|
def main():
    """Command-line entry point: ``python app.py <Path_to_Dir>``.

    Scans the directory given as first argument, then processes every
    identifier (in natural order) on a pool of 4 worker threads. An exception
    raised inside a worker is reported per identifier, not dropped with its
    future, and the remaining identifiers are still processed.

    Exits with status 1 when the directory argument is missing.
    """
    if len(sys.argv) < 2:
        print("Usage: python app.py <Path_to_Dir>")
        sys.exit(1)

    root_dir = sys.argv[1]

    print("Scanning files...")
    data = collect_files(root_dir)

    print(f"Found {len(data)} identifiers. Processing...")

    # Sort identifiers naturally
    sorted_identifiers = sorted(data, key=natural_key)

    # Process using 4 threads
    failed = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            (identifier,
             executor.submit(process_identifier, identifier,
                             data[identifier], root_dir))
            for identifier in sorted_identifiers
        ]
        # result() re-raises whatever the worker raised; without this call
        # the exception would stay hidden inside the future.
        for identifier, future in futures:
            try:
                future.result()
            except Exception as e:
                failed.append(identifier)
                print(f"Failed to process {identifier}: {e}")

    if failed:
        print(f"{len(failed)} identifier(s) failed: {', '.join(failed)}")
    print("Done.")


if __name__ == "__main__":
    main()
|