COCO dataset to YOLO format converter

Posted on Jul 15, 2025

1. Detection (COCO bounding boxes → YOLO detection labels)


import json
from pathlib import Path

from tqdm import tqdm

def convert_coco_to_yolo(json_path, labels_dir):
    """Convert COCO bounding-box annotations into YOLO detection label files.

    One ``<image stem>.txt`` file is written into *labels_dir* for every image
    that has at least one annotation. Each line has the form
    ``class cx cy w h`` where the box centre and size are normalised by the
    image width/height taken from the COCO ``images`` section.

    Args:
        json_path: Path (``str`` or ``Path``) to the COCO annotation JSON.
        labels_dir: Directory (``str`` or ``Path``) that receives the label
            files; created if it does not exist.

    Raises:
        KeyError: If an annotation references an ``image_id`` that is not
            listed in the ``images`` section.

    Note:
        Label files are overwritten, not appended to, so running the
        conversion twice yields the same result instead of duplicated lines.
        Label files left over from an earlier run for images that no longer
        have annotations are not removed.
    """
    json_path = Path(json_path)
    labels_dir = Path(labels_dir)

    with open(json_path, encoding="utf-8") as f:
        coco = json.load(f)

    # Map image_id -> file_name and image_id -> (width, height)
    id2file = {img['id']: img['file_name'] for img in coco['images']}
    id2size = {img['id']: (img['width'], img['height']) for img in coco['images']}
    labels_dir.mkdir(parents=True, exist_ok=True)

    # Collect all lines per label file first so every file is opened exactly
    # once and written in "w" mode (appending made re-runs duplicate labels).
    lines_by_file = {}
    for ann in tqdm(coco['annotations'], desc=f"Converting {json_path.name}"):
        image_id = ann['image_id']
        cat_id = ann['category_id'] - 1  # zero-indexed class IDs
        x, y, w, h = ann['bbox']  # COCO: top-left corner + size, in pixels
        img_w, img_h = id2size[image_id]

        # Convert to YOLO format: normalised box centre and size
        cx = (x + w / 2) / img_w
        cy = (y + h / 2) / img_h
        nw = w / img_w
        nh = h / img_h

        # Keyed by output path, so images that share a stem still end up in
        # the same label file, in annotation order.
        label_file = labels_dir / f"{Path(id2file[image_id]).stem}.txt"
        lines_by_file.setdefault(label_file, []).append(
            f"{cat_id} {cx:.6f} {cy:.6f} {nw:.6f} {nh:.6f}\n"
        )

    for label_file, lines in lines_by_file.items():
        with open(label_file, "w") as f:
            f.writelines(lines)

# Run the detection conversion once per split: each entry pairs the COCO
# annotation file with the label directory it fills (both relative to root).
root = Path("/home/ngx/Documents/projects/pocr/input/new_coco_dataset")
for ann_rel, lbl_rel in (
    ("annotations/annotations_train.json", "labels/train"),
    ("annotations/annotations_val.json", "labels/val"),
):
    convert_coco_to_yolo(root / ann_rel, root / lbl_rel)

2. Segmentation (COCO polygons → YOLO segmentation labels)

from shutil import copy2

# === CONFIG ===
# Input COCO dataset and the directory that receives the YOLO-seg copy.
source_root = Path("../input/new_coco_dataset")
target_root = Path("../input/new_coco_seg")

# Split names; each maps to images/<split>, labels/<split> and
# annotations/annotations_<split>.json.
splits = ["train", "val", "test"]

# Dataset description: per-split image folders, class count and class names.
yaml_dict = dict(
    train=source_root / 'images/train',
    val=source_root / 'images/val',
    test=source_root / 'images/test',  # optional
    nc=1,
    names=['text'],
)

# === Make folders ===
# Create the image and label output directories for every split up front.
for split in splits:
    for subdir in (f"images/{split}", f"labels/{split}"):
        (target_root / subdir).mkdir(parents=True, exist_ok=True)

# === Function to convert COCO to YOLO-seg ===
def coco_to_yolo_seg(coco_json_path: Path, image_dir: Path, label_dir: Path):
    """Convert one COCO split to YOLO segmentation labels and copy its images.

    For every entry in the COCO ``images`` section the image file is copied
    from *image_dir* to the matching location under the module-level
    ``target_root`` (mirroring its position relative to ``source_root``), and
    a ``<image stem>.txt`` label file is written to *label_dir*. Each polygon
    becomes one line ``class x1 y1 x2 y2 ...`` with coordinates normalised by
    the image width and height. Images without usable polygons get an empty
    label file.

    Args:
        coco_json_path: COCO annotation JSON for the split.
        image_dir: Directory holding the split's source images; must be
            located under ``source_root``.
        label_dir: Directory that receives the label files; created if it
            does not exist.

    Skipped input:
        * annotations flagged ``iscrowd``;
        * annotations whose ``segmentation`` is missing or is not a list of
          polygons (e.g. RLE masks, which are stored as a dict);
        * polygons with fewer than three points;
        * a trailing unpaired coordinate in an odd-length polygon.
    """
    with open(coco_json_path, 'r', encoding="utf-8") as f:
        coco = json.load(f)

    # Group the non-crowd annotations by the image they belong to.
    ann_map = {}
    for ann in coco["annotations"]:
        if ann.get("iscrowd", 0):
            continue
        ann_map.setdefault(ann["image_id"], []).append(ann)

    # Do not rely on the caller having created the output directory.
    label_dir.mkdir(parents=True, exist_ok=True)

    for img in tqdm(coco["images"], desc=f"Converting {coco_json_path.name}"):
        file_name = img["file_name"]
        img_path = image_dir / file_name
        out_img_path = target_root / image_dir.relative_to(source_root) / file_name
        out_img_path.parent.mkdir(parents=True, exist_ok=True)
        copy2(img_path, out_img_path)

        label_path = label_dir / f"{Path(file_name).stem}.txt"
        with open(label_path, "w") as f:
            for ann in ann_map.get(img["id"], []):
                segmentation = ann.get("segmentation")
                # RLE masks are dicts; iterating one would yield its keys
                # and crash on the division below. Only polygon lists are
                # convertible here.
                if not isinstance(segmentation, list):
                    continue
                cls_id = ann["category_id"] - 1  # COCO classes start from 1
                for seg in segmentation:
                    if len(seg) < 6:  # fewer than three (x, y) points
                        continue
                    # zip() pairs x with y and drops an unpaired trailing
                    # value instead of raising IndexError.
                    pts = [
                        f"{px / img['width']:.6f} {py / img['height']:.6f}"
                        for px, py in zip(seg[0::2], seg[1::2])
                    ]
                    f.write(f"{cls_id} " + " ".join(pts) + "\n")

# === Run conversion for all splits ===
for split in splits:
    annotation_file = source_root / f"annotations/annotations_{split}.json"
    images_in = source_root / f"images/{split}"
    labels_out = target_root / f"labels/{split}"

    # A split is converted only when both its annotation file and its image
    # folder are present; otherwise it is silently skipped.
    if not annotation_file.exists() or not images_in.exists():
        continue
    coco_to_yolo_seg(annotation_file, images_in, labels_out)

# === Write YAML ===
# Dataset file: absolute dataset root, split folders relative to that root,
# then the class count and class names taken from yaml_dict.
yaml_path = target_root / "data.yaml"
yaml_lines = [
    f"path: {target_root.resolve()}\n",
    "train: images/train\n",
    "val: images/val\n",
    "test: images/test\n",
    f"nc: {yaml_dict['nc']}\n",
    f"names: {yaml_dict['names']}\n",
]
with open(yaml_path, "w") as f:
    f.writelines(yaml_lines)

print("✅ COCO ➜ YOLOv8 segmentation format conversion complete.")
print(f"📄 data.yaml saved to: {yaml_path}")