import json
import os
import shutil
import uuid
from pathlib import Path

import yaml
from PIL import Image
from sklearn.model_selection import train_test_split
# ---- Config ----
annotation_file = "../input/nano-rec/Label.txt"
images_dir = Path("../input/nano-rec")
output_dir = Path("../input/new_coco_dataset")
output_dir.mkdir(parents=True, exist_ok=True)
splits = ['train', 'val', 'test']
# Create folders
for split in splits:
(output_dir / "images" / split).mkdir(parents=True, exist_ok=True)
(output_dir / "annotations").mkdir(parents=True, exist_ok=True)
# ---- Read PaddleOCR-style annotations ----
with open(annotation_file, 'r') as f:
lines = f.readlines()
data = []
for line in lines:
line = line.strip()
if not line:
continue
img_path, ann_json = line.split('\t')
full_img_path = images_dir / img_path
ann = json.loads(ann_json)
data.append((str(full_img_path), ann))
# ---- Split train/val/test ----
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)
splits_data = {
'train': train_data,
'val': val_data,
'test': test_data
}
# ---- COCO JSON Construction ----
def create_coco_json(data_split, split_name):
images = []
annotations = []
ann_id = 1 # COCO annotation IDs start at 1
image_id_map = {}
for img_idx, (img_path, anns) in enumerate(data_split):
img = Image.open(img_path)
width, height = img.size
file_name = Path(img_path).name
image_id = img_idx + 1 # unique image ID
# Copy image to output
target_path = output_dir / "images" / split_name / file_name
shutil.copy(img_path, target_path)
# Add image entry
images.append({
"id": image_id,
"file_name": file_name,
"width": width,
"height": height
})
image_id_map[file_name] = image_id
# Add annotation entries
for obj in anns:
if obj.get("difficult", False):
continue
points = obj["points"]
xs = [p[0] for p in points]
ys = [p[1] for p in points]
x_min = min(xs)
y_min = min(ys)
w = max(xs) - x_min
h = max(ys) - y_min
annotations.append({
"id": ann_id,
"image_id": image_id,
"category_id": 1,
"bbox": [x_min, y_min, w, h],
"area": w * h,
"iscrowd": 0,
"segmentation": [sum(points, [])] # flatten the list
})
ann_id += 1
# Build final JSON
coco_output = {
"info": {
"description": f"{split_name} dataset",
"version": "1.0",
"year": 2025
},
"licenses": [],
"images": images,
"annotations": annotations,
"categories": [
{"id": 1, "name": "text", "supercategory": "none"}
]
}
# Save JSON
json_path = output_dir / "annotations" / f"annotations_{split_name}.json"
with open(json_path, 'w') as jf:
json.dump(coco_output, jf, indent=2)
print(f"{split_name} COCO JSON saved to: {json_path}")
# ---- Run for all splits ----
for split in splits:
create_coco_json(splits_data[split], split)
# Customize paths based on your structure
yaml_dict = {
'train': output_dir/'images/train',
'val': output_dir/'images/val',
'test': output_dir/'images/test', # optional for evaluation
'nc': 1,
'names': ['text']
}
yaml_path = output_dir/Path("coco.yaml")
with open(yaml_path, 'w') as f:
yaml.dump(yaml_dict, f, sort_keys=False)
print(f"✅ coco.yaml saved to: {yaml_path}")
print("✅ COCO conversion complete.")