PaddlePaddle dataset to COCO dataset format converter

Posted on Jul 15, 2025
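This script converts a PaddleOCR-style detection dataset (a Label.txt file pairing image paths with JSON region lists) into COCO format: it splits the images into train/val/test sets, copies them into a new folder tree, writes one COCO JSON per split, and finishes with a YOLO-style coco.yaml describing the dataset.
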
import json
import shutil
from pathlib import Path

import yaml
from PIL import Image
from sklearn.model_selection import train_test_split

# ---- Config ----
annotation_file = "../input/nano-rec/Label.txt"
images_dir = Path("../input/nano-rec")
output_dir = Path("../input/new_coco_dataset")
output_dir.mkdir(parents=True, exist_ok=True)
splits = ['train', 'val', 'test']
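# Layout assumption: Label.txt lives in images_dir, and each image path inside
# it is given relative to images_dir (see how full_img_path is built below).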

# Create folders
(output_dir / "annotations").mkdir(parents=True, exist_ok=True)
for split in splits:
    (output_dir / "images" / split).mkdir(parents=True, exist_ok=True)

# ---- Read PaddleOCR-style annotations ----
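# Each Label.txt line pairs an image path with a JSON list of labelled regions,
# separated by a tab. An illustrative (made-up) example line, with \t marking
# the tab separator:
#   imgs/0001.jpg\t[{"transcription": "hello", "points": [[10, 20], [110, 20], [110, 60], [10, 60]], "difficult": false}]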
with open(annotation_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

data = []
for line in lines:
    line = line.strip()
    if not line:
        continue
    img_path, ann_json = line.split('\t', 1)  # split on the first tab only
    full_img_path = images_dir / img_path
    ann = json.loads(ann_json)
    data.append((str(full_img_path), ann))

# ---- Split train/val/test ----
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)
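# Net effect: 10% test, then 10% of the remainder for validation,
# i.e. roughly an 81/9/10 train/val/test split.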

splits_data = {
    'train': train_data,
    'val': val_data,
    'test': test_data
}

# ---- COCO JSON Construction ----
def create_coco_json(data_split, split_name):
    """Copy a split's images into place and write its COCO annotation JSON."""
    images = []
    annotations = []
    ann_id = 1  # COCO annotation IDs start at 1

    for img_idx, (img_path, anns) in enumerate(data_split):
        with Image.open(img_path) as img:
            width, height = img.size
        file_name = Path(img_path).name
        image_id = img_idx + 1  # IDs start at 1, unique within this split's JSON

        # Copy image to output
        target_path = output_dir / "images" / split_name / file_name
        shutil.copy(img_path, target_path)

        # Add image entry
        images.append({
            "id": image_id,
            "file_name": file_name,
            "width": width,
            "height": height
        })

        # Add annotation entries
        for obj in anns:
            if obj.get("difficult", False):
                continue

            points = obj["points"]
            xs = [p[0] for p in points]
            ys = [p[1] for p in points]
            x_min = min(xs)
            y_min = min(ys)
            w = max(xs) - x_min
            h = max(ys) - y_min
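            # e.g. points [[10, 20], [110, 20], [110, 60], [10, 60]] yield
            # bbox [10, 20, 100, 40] in COCO's [x, y, width, height] convention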

            annotations.append({
                "id": ann_id,
                "image_id": image_id,
                "category_id": 1,
                "bbox": [x_min, y_min, w, h],
                "area": w * h,
                "iscrowd": 0,
                "segmentation": [sum(points, [])]  # flatten the list
            })
            ann_id += 1

    # Build final JSON
    coco_output = {
        "info": {
            "description": f"{split_name} dataset",
            "version": "1.0",
            "year": 2025
        },
        "licenses": [],
        "images": images,
        "annotations": annotations,
        "categories": [
            {"id": 1, "name": "text", "supercategory": "none"}
        ]
    }

    # Save JSON
    json_path = output_dir / "annotations" / f"annotations_{split_name}.json"
    with open(json_path, 'w') as jf:
        json.dump(coco_output, jf, indent=2)
    print(f"{split_name} COCO JSON saved to: {json_path}")

# ---- Run for all splits ----
for split in splits:
    create_coco_json(splits_data[split], split)
# Customize paths based on your structure. The yaml below follows the
# YOLO-style data-config convention (train/val/test paths, nc, names).
# Paths are converted to str because yaml.dump cannot represent pathlib.Path objects.
yaml_dict = {
    'train': str(output_dir / 'images/train'),
    'val': str(output_dir / 'images/val'),
    'test': str(output_dir / 'images/test'),  # optional, for evaluation
    'nc': 1,             # number of classes
    'names': ['text']    # must match the COCO categories above
}

yaml_path = output_dir / "coco.yaml"
with open(yaml_path, 'w') as f:
    yaml.dump(yaml_dict, f, sort_keys=False)

print(f"✅ coco.yaml saved to: {yaml_path}")
print("✅ COCO conversion complete.")