Divide Label Studio YOLO annotation files into training and validation sets
Label Studio is an open-source data-labelling platform used to prepare training data for machine learning models.
Label Studio facilitates the manual labelling of data, via a web interface. Once the task is complete, the data can be exported into a range of formats, including YOLO. The resultant output will consist of an “images” folder and a “labels” folder. Each image corresponds to a label file in .txt format.
When training machine learning algorithms, there is a requirement to split this collection of labelled images into a “training” and “validation” set. Typically, 90% of the images go into the “training” set and 10% go into “validation”.
The following Python code will take a folder of Label Studio images and split them into a “training” and “validation” set, in a ratio that you choose.
Here is the Python code:
#!/usr/bin/env python
import os, shutil, random, argparse
def options():
    """Parse the command-line arguments for the train/validation split.

    Returns:
        argparse.Namespace with the attributes ``folder``, ``extension``,
        ``split`` and ``seed``. All values are returned as strings exactly as
        typed on the command line; ``seed`` is ``None`` when ``-r`` is omitted.
    """
    parser = argparse.ArgumentParser(
        description="Split a Label Studio YOLO export into training and validation sets"
    )
    parser.add_argument("-f", "--folder", help="Target folder of images.", required=True)
    parser.add_argument("-e", "--extension", help="File extension of images", required=True)
    parser.add_argument(
        "-s",
        "--split",
        help="Percentage of images assigned to training; the rest go to validation (typically 90)",
        required=True,
    )
    parser.add_argument("-r", "--seed", help="Set seed for randomisation", required=False)
    return parser.parse_args()
def _move_pairs(names, ext, src_images, src_labels, dst_images, dst_labels):
    """Move each image and its matching ``.txt`` label into the destination folders."""
    for name in names:
        image = name + ext
        shutil.move(os.path.join(src_images, image), os.path.join(dst_images, image))

        label = name + ".txt"
        src_label = os.path.join(src_labels, label)
        if os.path.exists(src_label):
            shutil.move(src_label, os.path.join(dst_labels, label))
        else:
            # Keep going rather than aborting half-way through the move.
            print("Warning: no label file found for", image)


def main():
    """Split the images and labels under ``<folder>`` into training and validation sets.

    Reads the command-line options, creates ``training`` and ``validation``
    sub-folders inside both ``<folder>/images`` and ``<folder>/labels``,
    shuffles the image names with a fixed seed, and moves the first
    ``split`` percent of image/label pairs into ``training`` and the
    remainder into ``validation``.

    Raises:
        SystemExit: if the images or labels folder is missing, or the split
            percentage is outside 0-100.
    """
    # Get options
    args = options()

    # Preparing the folder structure
    folder = args.folder
    folder_images = os.path.join(folder, "images")
    folder_labels = os.path.join(folder, "labels")

    # Add leading dot to file extension if missing
    ext = args.extension
    if not ext.startswith("."):
        ext = "." + ext

    print("Images are located at", folder_images, "\nLabels are located at", folder_labels)

    # Check the inputs before touching the disk: makedirs() below would
    # otherwise silently create a missing images/labels folder.
    for required in (folder_images, folder_labels):
        if not os.path.isdir(required):
            raise SystemExit("Folder not found: " + required)

    # Collect split percentage
    split_percentage = int(args.split)
    if not 0 <= split_percentage <= 100:
        raise SystemExit("Split percentage must be between 0 and 100")

    # Create new folders even if folders already exist
    training_images_path = os.path.join(folder_images, "training")
    validation_images_path = os.path.join(folder_images, "validation")
    training_labels_path = os.path.join(folder_labels, "training")
    validation_labels_path = os.path.join(folder_labels, "validation")
    for path in (
        training_images_path,
        validation_images_path,
        training_labels_path,
        validation_labels_path,
    ):
        os.makedirs(path, exist_ok=True)

    # Collect image names (without extension). Only the top level of the
    # images folder is scanned so that files already moved into training/ or
    # validation/ by an earlier run are not picked up again. Sorting makes the
    # shuffle reproducible regardless of the directory listing order.
    files = sorted(
        entry[: -len(ext)]
        for entry in os.listdir(folder_images)
        if entry.endswith(ext) and os.path.isfile(os.path.join(folder_images, entry))
    )

    # Randomise files, using seed (if supplied). 100 is used as the fallback
    # seed so that runs without -r are still repeatable.
    seed = int(args.seed) if args.seed is not None else 100
    random.seed(seed)
    random.shuffle(files)

    size = len(files)
    split = int(split_percentage * size / 100)

    print("Moving training data...")
    _move_pairs(
        files[:split],
        ext,
        folder_images,
        folder_labels,
        training_images_path,
        training_labels_path,
    )

    print("Moving validation data...")
    _move_pairs(
        files[split:],
        ext,
        folder_images,
        folder_labels,
        validation_images_path,
        validation_labels_path,
    )
# Run the split only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Usage of the script requires the following inputs:
- -f: Parent folder containing the Label Studio images and labels folders (string, required)
- -e: The file extension of the images (string, required)
- -s: The percentage of images assigned to the training set; the remainder go to validation (integer, typically 90, required)
- -r: A seed to facilitate randomisation (integer, optional)
Example usage:
split-images.py -f /home/amdimech/path/to/images/ -e .png -s 90 -r 1026
The full code can be accessed on GitHub Gist.
