Divide Label Studio YOLO annotation files into training and validation sets

Label Studio is an open-source data-labelling platform used to prepare training data for machine learning models.

Label Studio facilitates the manual labelling of data, via a web interface. Once the task is complete, the data can be exported into a range of formats, including YOLO. The resultant output will consist of an “images” folder and a “labels” folder. Each image corresponds to a label file in .txt format.

When training machine learning algorithms, there is a requirement to split this collection of labelled images into a “training” and “validation” set. Typically, 90% of the images go into the “training” set and 10% go into “validation”.

The following Python code will take a folder of Label Studio images and split them into a “training” and “validation” set, in a ratio that you choose.


Here is the Python code:

#!/usr/bin/env python

import os, shutil, random, argparse

def options():
	"""Parse the command-line arguments of the train/validation splitter.

	Returns:
		argparse.Namespace with the attributes ``folder``, ``extension``,
		``split`` and ``seed``. All values are returned as the raw strings
		typed on the command line; ``seed`` is None when -r is omitted.
	"""
	parser = argparse.ArgumentParser(
		description="Split a Label Studio YOLO export into training and validation sets"
	)
	parser.add_argument(
		"-f", "--folder",
		help="Export folder that contains the 'images' and 'labels' sub-folders.",
		required=True,
	)
	parser.add_argument(
		"-e", "--extension",
		help="File extension of images (leading dot optional)",
		required=True,
	)
	parser.add_argument(
		"-s", "--split",
		help="Percentage of images that go into the training set (typically 90)",
		required=True,
	)
	parser.add_argument(
		"-r", "--seed",
		help="Set seed for randomisation",
		required=False,
	)
	return parser.parse_args()

def _move_pair(stem, ext, src_images, src_labels, dst_images, dst_labels):
	"""Move one image and, when present, its YOLO label file into a subset folder."""
	image_name = stem + ext
	shutil.move(os.path.join(src_images, image_name), os.path.join(dst_images, image_name))

	label_name = stem + ".txt"
	label_path = os.path.join(src_labels, label_name)
	if os.path.exists(label_path):
		shutil.move(label_path, os.path.join(dst_labels, label_name))
	else:
		# Keep going: aborting here would leave the split half-done with the
		# image already moved.
		print("Warning: no label file found for", image_name)


def main():
	"""Split the images and labels of an export folder into training/validation.

	Reads the command-line options, creates ``training`` and ``validation``
	sub-folders inside both ``<folder>/images`` and ``<folder>/labels``,
	shuffles the images found directly inside ``<folder>/images`` and moves
	the first ``split`` percent (with their ``.txt`` label files) into
	``training`` and the remainder into ``validation``.

	Raises:
		SystemExit: if the split percentage is outside 0-100 or the images
			folder does not exist.
		ValueError: if the split or seed option is not an integer.
	"""
	# Get options
	args = options()

	# Preparing the folder structure
	folder = args.folder
	folder_images = os.path.join(folder, "images")
	folder_labels = os.path.join(folder, "labels")
	ext = args.extension

	# Add leading dot to file extension if missing
	if not ext.startswith("."):
		ext = "." + ext
	print("Images are located at", folder_images, "\nLabels are located at", folder_labels)

	# Validate the inputs before touching the file system
	split_percentage = int(args.split)
	if not 0 <= split_percentage <= 100:
		raise SystemExit("Split percentage must be between 0 and 100")
	if not os.path.isdir(folder_images):
		raise SystemExit("Images folder not found: " + folder_images)

	# Prepare to create new folders
	training_images_path = os.path.join(folder_images, "training")
	validation_images_path = os.path.join(folder_images, "validation")
	training_labels_path = os.path.join(folder_labels, "training")
	validation_labels_path = os.path.join(folder_labels, "validation")

	# Create the folders; exist_ok keeps the contents of folders left by an
	# earlier run instead of failing or deleting them.
	for path in (
		training_images_path,
		validation_images_path,
		training_labels_path,
		validation_labels_path,
	):
		os.makedirs(path, exist_ok=True)

	# Collect the image names (without extension). Only the top level of the
	# images folder is scanned so that files already moved into the
	# training/validation sub-folders are not picked up again. Sorting makes
	# the shuffle below reproducible, because directory order is arbitrary.
	files = sorted(
		name[:-len(ext)]
		for name in os.listdir(folder_images)
		if name.endswith(ext) and os.path.isfile(os.path.join(folder_images, name))
	)

	# Randomise files, using the seed if supplied and a fixed default of 100
	# otherwise, so every run is repeatable.
	seed = int(args.seed) if args.seed is not None else 100
	random.seed(seed)
	random.shuffle(files)

	size = len(files)
	split = split_percentage * size // 100

	print("Moving training data...")
	for stem in files[:split]:
		_move_pair(
			stem, ext, folder_images, folder_labels,
			training_images_path, training_labels_path,
		)

	print("Moving validation data...")
	for stem in files[split:]:
		_move_pair(
			stem, ext, folder_images, folder_labels,
			validation_images_path, validation_labels_path,
		)


# Run the split only when executed as a script, not when imported
if __name__ == '__main__':
	main()


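Usage of the script requires the following inputs:

-f / --folder: the exported folder that contains the “images” and “labels” sub-folders (required)
-e / --extension: the file extension of the images, with or without the leading dot (required)
-s / --split: the percentage of images to place in the “training” set, typically 90 (required)
-r / --seed: a seed for the randomisation (optional)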

Example usage: python <script>.py -f /home/amdimech/path/to/images/ -e .png -s 90 -r 1026


The full code can be accessed on GitHub Gist.



