Fix predefined sorting for task data (cvat-ai#5083)
Fixes cvat-ai#5061, cvat-ai#4179

- Added a way to declare a custom file order for local task data
uploads via the TUS protocol
- Added an option to use a dataset manifest to support the `predefined`
sorting method
- The manifest is now required for the `predefined` sorting mode with
image archives
- Fixed file ordering when tasks are created from the SDK or CLI in the
`predefined` sorting mode
- Added more tests for the task data uploading API

The upload protocol is implemented as follows:

The user specifies `sorting_method=predefined` in the task creation
request, then uploads the data (a client-side sketch of the flow is
given after the list below).

1. Client file uploading
1.1. The files are uploaded as separate files (using the TUS protocol)
or as grouped files (using `Upload-Multiple` requests).
1.2. The `Upload-Finish` request comes (or its unlabeled legacy
equivalent). A new optional field can be supplied: `upload_file_order`,
a list of strings. It allows the input file order to be overridden if
necessary, and is only valid with the `predefined` sorting method
specified.
1.2.1. If the field is empty or missing, the client files in the data
requests are considered ordered.
1.2.2. If the field is not empty, it is expected to contain the list of
file names in the required order.
1.2.2.1. If there are `client_files` in the request, the files are
sorted according to this list.
1.2.2.2. If the file lists mismatch, an explanatory error is raised.

2. Data processing
2.1. At this point, all `*_files` are considered ordered as requested.
2.2. A manifest file is required for zip uploads with `predefined`
sorting. The file is expected to accompany the uploaded zip file, not
to be inside the archive.
2.3. If there is a manifest in the input data, files are ordered
according to the manifest.
2.3.1. If the data is extracted from cloud storage, only the specified
subset of the files is kept in the manifest.
2.3.2. If an uploaded file is not listed in the manifest, an error is
raised.
2.3.3. A `job_file_mapping`, if specified, has higher priority than the
manifest.
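A minimal client-side sketch of this flow, using `requests`. The `Upload-*` header names come from the `UploadMixin` docstring later in this commit; the endpoint shape, the header values, and the multipart field names (`client_files[N]`) are assumptions for illustration.

```python
import requests

data_url = "http://localhost:8080/api/tasks/42/data"  # placeholder task
session = requests.Session()  # assumed to be already authenticated

# Open the upload session
session.post(data_url, headers={"Upload-Start": "true"})

# 1.1. Send files grouped in a single Upload-Multiple request,
# possibly out of the final order
session.post(
    data_url,
    headers={"Upload-Multiple": "true"},
    files={
        "client_files[0]": ("frame_002.jpg", open("frame_002.jpg", "rb")),
        "client_files[1]": ("frame_001.jpg", open("frame_001.jpg", "rb")),
    },
)

# 1.2.2. Finish the upload, declaring the required final order
session.post(
    data_url,
    headers={"Upload-Finish": "true"},
    json={
        "image_quality": 70,
        "sorting_method": "predefined",
        "upload_file_order": ["frame_001.jpg", "frame_002.jpg"],
    },
)
```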

Co-authored-by: Roman Donchenko <[email protected]>
zhiltsov-max and SpecLad authored Jun 8, 2023
1 parent 04a1f0b commit 8df8872
Showing 22 changed files with 1,234 additions and 210 deletions.
2 changes: 2 additions & 0 deletions .remarkignore
@@ -1,2 +1,4 @@
cvat-sdk/docs/
cvat-sdk/README.md
.env/
site/themes/
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## \[2.5.0] - Unreleased
### Added
- \[Server API\] An option to supply custom file ordering for task data uploads (<https://github.com/opencv/cvat/pull/5083>)

### Changed
- Allowed to use dataset manifest for the `predefined` sorting method for task data (<https://github.com/opencv/cvat/pull/5083>)
- TBD

### Changed
@@ -25,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Deletion of uploaded files, including annotations and backups,
after they have been uploaded to the server using the TUS protocol but before an RQ job has been initiated. (<https://github.com/opencv/cvat/pull/5909>)
- Simultaneous creation of tasks or projects with identical names from backups by multiple users. (<https://github.com/opencv/cvat/pull/5909>)
- \[Server API\] The `predefined` sorting method for task data uploads (<https://github.com/opencv/cvat/pull/5083>)

### Security
- TBD
7 changes: 5 additions & 2 deletions cvat-sdk/cvat_sdk/core/uploading.py
@@ -1,4 +1,4 @@
# Copyright (C) 2022 CVAT.ai Corporation
# Copyright (C) 2022-2023 CVAT.ai Corporation
#
# SPDX-License-Identifier: MIT

@@ -350,6 +350,10 @@ def upload_files(
if pbar is not None:
pbar.start(total_size, desc="Uploading data")

if str(kwargs.get("sorting_method")).lower() == "predefined":
# Request file ordering, because we reorder files to send more efficiently
kwargs.setdefault("upload_file_order", [p.name for p in resources])

self._tus_start_upload(url)

for group, group_size in bulk_file_groups:
@@ -374,7 +378,6 @@ def upload_files(
pbar.advance(group_size)

for filename in separate_files:
# TODO: check if basename produces invalid paths here, can lead to overwriting
self._upload_file_data_with_tus(
url,
filename,
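From the SDK side, the change above means a caller only has to pass `sorting_method`; the original file order is recorded automatically. A hedged usage sketch follows: `make_client` and `create_from_data` exist in the SDK of this era, but the exact model classes and `data_params` keys shown are assumptions, and the host, credentials, labels, and file names are placeholders.

```python
from cvat_sdk import make_client, models
from cvat_sdk.core.proxies.tasks import ResourceType

with make_client(host="localhost", credentials=("user", "password")) as client:
    # Files are listed in the order the frames should have; the uploader
    # above regroups them for transfer efficiency and sends
    # `upload_file_order` so the server can restore this order.
    task = client.tasks.create_from_data(
        spec=models.TaskWriteRequest(
            name="example task",
            labels=[models.PatchedLabelRequest(name="cat")],
        ),
        resource_type=ResourceType.LOCAL,
        resources=["frame_10.jpg", "frame_2.jpg", "frame_1.jpg"],
        data_params={"sorting_method": "predefined", "image_quality": 70},
    )
```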
18 changes: 15 additions & 3 deletions cvat/apps/engine/media_extractors.py
@@ -381,12 +381,24 @@ def get_zip_filename(self):

def get_path(self, i):
if self._zip_source.filename:
return os.path.join(os.path.dirname(self._zip_source.filename), self._source_path[i]) \
if not self.extract_dir else os.path.join(self.extract_dir, self._source_path[i])
prefix = self._get_extract_prefix()
return os.path.join(prefix, self._source_path[i])
else: # necessary for mime_type definition
return self._source_path[i]

def __contains__(self, media_file):
return super().__contains__(os.path.relpath(media_file, self._get_extract_prefix()))

def _get_extract_prefix(self):
return self.extract_dir or os.path.dirname(self._zip_source.filename)

def reconcile(self, source_files, step=1, start=0, stop=None, dimension=DimensionType.DIM_2D, sorting_method=None):
if source_files:
# file list is expected to be a processed output of self.get_path()
# which returns files with the output directory prefix
prefix = self._get_extract_prefix()
source_files = [os.path.relpath(fn, prefix) for fn in source_files]

super().reconcile(
source_files=source_files,
step=step,
@@ -397,7 +409,7 @@ def reconcile(self, source_files, step=1, start=0, stop=None, dimension=Dimensio
)

def extract(self):
self._zip_source.extractall(self.extract_dir if self.extract_dir else os.path.dirname(self._zip_source.filename))
self._zip_source.extractall(self._get_extract_prefix())
if not self.extract_dir:
os.remove(self._zip_source.filename)

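A standalone illustration of the path round-trip that `_get_extract_prefix` centralizes (the paths are hypothetical): `get_path()` joins the extraction prefix onto archive-relative names, and `__contains__()`/`reconcile()` must strip that prefix again before comparing against the archive's own file list.

```python
import os.path

prefix = "/data/tasks/42/raw"        # extract_dir, or the zip file's directory
inside_zip = "images/frame_001.jpg"  # the name as stored in the archive

full_path = os.path.join(prefix, inside_zip)  # what get_path() returns
# what __contains__() and reconcile() undo before the comparison:
assert os.path.relpath(full_path, prefix) == inside_zip
```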
37 changes: 37 additions & 0 deletions cvat/apps/engine/migrations/0068_auto_20230418_0901.py
@@ -0,0 +1,37 @@
# Generated by Django 3.2.18 on 2023-04-18 09:01

from django.db import migrations


class Migration(migrations.Migration):

dependencies = [
('engine', '0067_alter_cloudstorage_credentials_type'),
]

operations = [
migrations.AlterModelOptions(
name='clientfile',
options={'default_permissions': (), 'ordering': ('id',)},
),
migrations.AlterModelOptions(
name='relatedfile',
options={'default_permissions': (), 'ordering': ('id',)},
),
migrations.AlterModelOptions(
name='remotefile',
options={'default_permissions': (), 'ordering': ('id',)},
),
migrations.AlterModelOptions(
name='serverfile',
options={'default_permissions': (), 'ordering': ('id',)},
),
migrations.AlterUniqueTogether(
name='remotefile',
unique_together={('data', 'file')},
),
migrations.AlterUniqueTogether(
name='serverfile',
unique_together={('data', 'file')},
),
]
75 changes: 60 additions & 15 deletions cvat/apps/engine/mixins.py
@@ -6,6 +6,7 @@
import base64
import json
import os
import os.path
import uuid
from dataclasses import asdict, dataclass
from distutils.util import strtobool
@@ -151,9 +152,30 @@ def __init__(self, request):
self.size = int(request.META.get("CONTENT_LENGTH", settings.TUS_DEFAULT_CHUNK_SIZE))
self.content = request.body

# This upload mixin is implemented using tus
# tus is open protocol for file uploads (see more https://tus.io/)
class UploadMixin:
"""
Implements file uploads to the server. Allows uploading single and multiple files, and
suspending and resuming uploads. Uses the TUS open file upload protocol (https://tus.io/).
Implements the following protocols:
a. A single Data request
and
b.1. An Upload-Start request
b.2.a. The regular TUS protocol requests (Upload-Length + Chunks)
b.2.b. Upload-Multiple requests
b.3. An Upload-Finish request
Requests:
- Data - POST, no extra headers or 'Upload-Start' + 'Upload-Finish' headers
- Upload-Start - POST, has an 'Upload-Start' header
- Upload-Length - POST, has an 'Upload-Length' header (read the TUS protocol)
- Chunk - HEAD/PATCH (read the TUS protocol)
- Upload-Finish - POST, has an 'Upload-Finish' header
- Upload-Multiple - POST, has an 'Upload-Multiple' header
"""

_tus_api_version = '1.0.0'
_tus_api_version_supported = ['1.0.0']
_tus_api_extensions = []
@@ -204,11 +226,11 @@ def upload_data(self, request):
if one_request_upload or finish_upload:
return self.upload_finished(request)
elif start_upload:
return Response(status=status.HTTP_202_ACCEPTED)
return self.upload_started(request)
elif tus_request:
return self.init_tus_upload(request)
elif bulk_file_upload:
return self.append(request)
return self.append_files(request)
else: # backward compatibility case - no upload headers were found
return self.upload_finished(request)

@@ -218,7 +240,7 @@ def init_tus_upload(self, request):
else:
metadata = self._get_metadata(request)
filename = metadata.get('filename', '')
if not self.validate_filename(filename):
if not self.is_valid_uploaded_file_name(filename):
return self._tus_response(status=status.HTTP_400_BAD_REQUEST,
data="File name {} is not allowed".format(filename))

@@ -310,32 +332,55 @@ def append_tus_chunk(self, request, file_id):
extra_headers={'Upload-Offset': tus_file.offset,
'Upload-Filename': tus_file.filename})

def validate_filename(self, filename):
def is_valid_uploaded_file_name(self, filename: str) -> bool:
"""
Checks that the file name is valid and does not escape the upload directory.
Returns True if the file name is valid, otherwise False.
"""

upload_dir = self.get_upload_dir()
file_path = os.path.join(upload_dir, filename)
return os.path.commonprefix((os.path.realpath(file_path), upload_dir)) == upload_dir

def get_upload_dir(self):
def get_upload_dir(self) -> str:
return self._object.data.get_upload_dirname()

def get_request_client_files(self, request):
def _get_request_client_files(self, request):
serializer = DataSerializer(self._object, data=request.data)
serializer.is_valid(raise_exception=True)
data = {k: v for k, v in serializer.validated_data.items()}
return data.get('client_files', None)
return serializer.validated_data.get('client_files')

def append(self, request):
client_files = self.get_request_client_files(request)
def append_files(self, request):
"""
Processes a single or multiple files sent in a single request inside
a file uploading session.
"""

client_files = self._get_request_client_files(request)
if client_files:
upload_dir = self.get_upload_dir()
for client_file in client_files:
with open(os.path.join(upload_dir, client_file['file'].name), 'ab+') as destination:
filename = client_file['file'].name
if not self.is_valid_uploaded_file_name(filename):
return Response(status=status.HTTP_400_BAD_REQUEST,
data=f"File name {filename} is not allowed", content_type="text/plain")

with open(os.path.join(upload_dir, filename), 'ab+') as destination:
destination.write(client_file['file'].read())
return Response(status=status.HTTP_200_OK)

# override this to do stuff after upload
def upload_started(self, request):
"""
Allows actions to be performed before the file upload starts.
"""
return Response(status=status.HTTP_202_ACCEPTED)

def upload_finished(self, request):
raise NotImplementedError('You need to implement upload_finished in UploadMixin')
"""
Processes the uploaded files after the upload has finished.
"""

raise NotImplementedError('Must be implemented in the derived class')

class AnnotationMixin:
def export_annotations(self, request, db_obj, export_func, callback, get_data=None):
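The `UploadMixin` docstring above can be made concrete with a sketch of protocol "b" for a single file: Upload-Start, plain TUS (create, then one PATCH chunk), and Upload-Finish. The `Upload-*` header names and the TUS version come from the mixin code; the URL shape, auth, and header values are assumptions.

```python
import base64
import requests

data_url = "http://localhost:8080/api/tasks/42/data"  # placeholder task
session = requests.Session()  # assumed to be already authenticated

session.post(data_url, headers={"Upload-Start": "true"})  # b.1

payload = open("frame_001.jpg", "rb").read()

# b.2.a: create a TUS upload; the server replies with the file URL
created = session.post(data_url, headers={
    "Tus-Resumable": "1.0.0",
    "Upload-Length": str(len(payload)),
    "Upload-Metadata": "filename " + base64.b64encode(b"frame_001.jpg").decode(),
})
file_url = created.headers["Location"]

# b.2.a: send the file contents as a single chunk
session.patch(file_url, data=payload, headers={
    "Tus-Resumable": "1.0.0",
    "Upload-Offset": "0",
    "Content-Type": "application/offset+octet-stream",
})

session.post(data_url, headers={"Upload-Finish": "true"},  # b.3
             json={"image_quality": 70})
```

For many small files, the grouped `Upload-Multiple` requests (protocol b.2.b) would replace the TUS steps.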
23 changes: 18 additions & 5 deletions cvat/apps/engine/models.py
@@ -264,11 +264,6 @@ def make_dirs(self):
os.makedirs(self.get_original_cache_dirname())
os.makedirs(self.get_upload_dirname())

def get_uploaded_files(self):
upload_dir = self.get_upload_dirname()
uploaded_files = [os.path.join(upload_dir, file) for file in os.listdir(upload_dir) if os.path.isfile(os.path.join(upload_dir, file))]
represented_files = [{'file':f} for f in uploaded_files]
return represented_files

class Video(models.Model):
data = models.OneToOneField(Data, on_delete=models.CASCADE, related_name="video", null=True)
@@ -424,13 +419,22 @@ class Meta:
default_permissions = ()
unique_together = ("data", "file")

# Some DBs can shuffle the rows. Here we restore the insertion order.
# https://github.com/opencv/cvat/pull/5083#discussion_r1038032715
ordering = ('id', )

# For server files on the mounted share
class ServerFile(models.Model):
data = models.ForeignKey(Data, on_delete=models.CASCADE, null=True, related_name='server_files')
file = models.CharField(max_length=1024)

class Meta:
default_permissions = ()
unique_together = ("data", "file")

# Some DBs can shuffle the rows. Here we restore the insertion order.
# https://github.com/opencv/cvat/pull/5083#discussion_r1038032715
ordering = ('id', )

# For URLs
class RemoteFile(models.Model):
@@ -439,6 +443,11 @@ class RemoteFile(models.Model):

class Meta:
default_permissions = ()
unique_together = ("data", "file")

# Some DBs can shuffle the rows. Here we restore the insertion order.
# https://github.com/opencv/cvat/pull/5083#discussion_r1038032715
ordering = ('id', )


class RelatedFile(models.Model):
@@ -451,6 +460,10 @@ class Meta:
default_permissions = ()
unique_together = ("data", "path")

# Some DBs can shuffle the rows. Here we restore the insertion order.
# https://github.com/opencv/cvat/pull/5083#discussion_r1038032715
ordering = ('id', )

class Segment(models.Model):
task = models.ForeignKey(Task, on_delete=models.CASCADE)
start_frame = models.IntegerField()
34 changes: 28 additions & 6 deletions cvat/apps/engine/serializers.py
@@ -763,12 +763,33 @@ class DataSerializer(serializers.ModelSerializer):
"""))
job_file_mapping = JobFileMapping(required=False, write_only=True)

upload_file_order = serializers.ListField(
child=serializers.CharField(max_length=1024),
default=list, allow_empty=True, write_only=True,
help_text=textwrap.dedent("""\
Allows specifying the file order for client_file uploads.
Only valid with the "{}" sorting method selected.
To state that the input files are sent in the correct order,
pass an empty list.
If you want to send files in an arbitrary order
and reorder them afterwards on the server,
pass the list of file names in the required order.
""".format(models.SortingMethod.PREDEFINED))
)

class Meta:
model = models.Data
fields = ('chunk_size', 'size', 'image_quality', 'start_frame', 'stop_frame', 'frame_filter',
'compressed_chunk_type', 'original_chunk_type', 'client_files', 'server_files', 'server_files_exclude','remote_files', 'use_zip_chunks',
'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method', 'storage', 'sorting_method', 'filename_pattern',
'job_file_mapping')
fields = (
'chunk_size', 'size', 'image_quality', 'start_frame', 'stop_frame', 'frame_filter',
'compressed_chunk_type', 'original_chunk_type',
'client_files', 'server_files', 'remote_files',
'use_zip_chunks', 'server_files_exclude',
'cloud_storage_id', 'use_cache', 'copy_data', 'storage_method',
'storage', 'sorting_method', 'filename_pattern',
'job_file_mapping', 'upload_file_order',
)
extra_kwargs = {
'chunk_size': { 'help_text': "Maximum number of frames per chunk" },
'size': { 'help_text': "The number of frames" },
@@ -858,8 +879,9 @@ def _pop_data(self, validated_data):
server_files = validated_data.pop('server_files')
remote_files = validated_data.pop('remote_files')

validated_data.pop('job_file_mapping', None) # optional
validated_data.pop('server_files_exclude', None) # optional
validated_data.pop('job_file_mapping', None) # optional, not present in Data
validated_data.pop('upload_file_order', None) # optional, not present in Data
validated_data.pop('server_files_exclude', None) # optional, not present in Data

for extra_key in { 'use_zip_chunks', 'use_cache', 'copy_data' }:
validated_data.pop(extra_key)
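For illustration, the two modes described in the field's help text might look like this in the finish-request payload (the file names are placeholders):

```python
# 1.2.1: the files were already sent in the required order
payload_ordered = {
    "sorting_method": "predefined",
    "upload_file_order": [],
}

# 1.2.2: the files were sent in an arbitrary order and the server
# must reorder them to match this list
payload_reordered = {
    "sorting_method": "predefined",
    "upload_file_order": ["scene1/frame_001.jpg", "scene1/frame_002.jpg"],
}
```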