Skip to content

Commit

Permalink
csv to manifest example
Browse files Browse the repository at this point in the history
  • Loading branch information
AWSChris committed Feb 14, 2022
2 parents 87c5532 + ff0b6df commit f8cd654
Show file tree
Hide file tree
Showing 7 changed files with 377 additions and 0 deletions.
201 changes: 201 additions & 0 deletions python/example_code/lookoutvision/csv_to_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Purpose
Shows how to create an Amazon Lookout for Vision manifest file from a CSV file.
The CSV file format is <image location>,<anomaly classification> (normal or anomaly)
For example:
s3://s3bucket/circuitboard/train/anomaly/train_11.jpg,anomaly
s3://s3bucket/circuitboard/train/normal/train_1.jpg,normal
If necessary, use the --s3_path argument to specify the S3 bucket and folder path for the images.
For more information, see https://docs.aws.amazon.com/lookout-for-vision/latest/developer-guide/ex-csv-manifest.html
"""

# snippet-start:[python.example_code.lookoutvision.Scenario_CSVtoManifest]

import argparse
import csv
import json
import logging
import os
from datetime import datetime, timezone

logger = logging.getLogger(__name__)


def check_errors(csv_file):
    """
    Checks for duplicate images and incorrect classifications in a CSV file.
    If duplicate images or invalid anomaly assignments are found, an errors CSV file
    (<csv_file>_errors.csv) and a deduplicated CSV file (<csv_file>_deduplicated.csv)
    are created. Only the first occurrence of an image is written to the deduplicated
    file. Later occurrences are recorded in the errors file.
    Each errors row is: line number in the source CSV, image, classification, error code
    (INVALID_CLASSIFICATION or DUPLICATE). A row without a classification column is
    reported as INVALID_CLASSIFICATION with an empty classification.
    If no problems are found, both generated files are deleted.
    :param csv_file: The source CSV file
    :return: True if errors or duplicates are found, otherwise false.
    """

    logger.info(f"Checking {csv_file}.")

    errors_found = False

    errors_file = f"{csv_file}_errors.csv"
    deduplicated_file = f"{csv_file}_deduplicated.csv"

    # The csv module does its own newline handling, so every file it reads or
    # writes is opened with newline='' (otherwise rows gain an extra \r on Windows).
    with open(csv_file, 'r', newline='') as f,\
            open(deduplicated_file, 'w', newline='') as dedup,\
            open(errors_file, 'w', newline='') as errors:

        reader = csv.reader(f, delimiter=',')
        dedup_writer = csv.writer(dedup)
        error_writer = csv.writer(errors)
        entries = set()

        for row in reader:

            # Skip empty lines
            if not ''.join(row).strip():
                continue

            # Use the reader's own position so that skipped blank lines do not
            # shift the line numbers reported in the errors file.
            line = reader.line_num
            image = row[0]
            # A row with only an image column has no classification to validate.
            classification = row[1] if len(row) > 1 else ""

            # Record any incorrect classifications
            if classification.lower() not in ("normal", "anomaly"):
                error_writer.writerow(
                    [line, image, classification, "INVALID_CLASSIFICATION"])
                errors_found = True

            # Write first image entry to dedup file and record duplicates
            if image not in entries:
                dedup_writer.writerow(row)
                entries.add(image)
            else:
                error_writer.writerow(
                    [line, image, classification, "DUPLICATE"])
                errors_found = True

    if errors_found:
        logger.info(f"Errors found check {errors_file}.")
    else:
        # Nothing to report, so don't leave empty/redundant files behind.
        os.remove(errors_file)
        os.remove(deduplicated_file)

    return errors_found


def create_manifest_file(csv_file, manifest_file, s3_path):
    """
    Reads a CSV file and creates a Lookout for Vision classification manifest file.
    Every non-empty CSV row (<image>,<classification>) becomes one JSON line in the
    manifest. A classification of 'anomaly' (any letter case) is written as
    anomaly-label 1; any other value is written as anomaly-label 0. The classification
    text itself is copied unchanged into "class-name".
    :param csv_file: The source CSV file
    :param manifest_file: The name of the manifest file to create.
    :param s3_path: The S3 path to the folder that contains the images. It is
        prepended as-is to the image value from column 1.
    :return: A tuple of (number of images written, number of anomalous images).
    """
    logger.info(f"Processing CSV file {csv_file}.")

    image_count = 0
    anomalous_count = 0

    with open(csv_file, newline='') as csvfile, open(manifest_file, "w") as output_file:

        # Read with the default csv dialect, the same one check_errors uses to
        # read the source file and to write the deduplicated file.
        image_classifications = csv.reader(csvfile, delimiter=',')

        # process each row (image) in CSV file
        for row in image_classifications:
            # Skip empty lines
            if not ''.join(row).strip():
                continue

            source_ref = str(s3_path) + row[0]
            classification = 0

            if row[1].lower() == 'anomaly':
                classification = 1
                anomalous_count += 1

            entry = {
                "source-ref": source_ref,
                "anomaly-label": classification,
                "anomaly-label-metadata": {
                    "confidence": 1,
                    "job-name": "labeling-job/anomaly-classification",
                    "class-name": row[1],
                    "human-annotated": "yes",
                    "creation-date": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%f'),
                    "type": "groundtruth/image-classification"
                }
            }

            # json.dumps escapes quotes/backslashes in the CSV values so each
            # line is always valid JSON. The separators keep the layout the
            # same as before ('"key": value,"key": value').
            output_file.write(
                json.dumps(entry, separators=(',', ': '), ensure_ascii=False) + '\n')
            image_count += 1

    logger.info(f"Finished creating manifest file {manifest_file}.\n"
                f"Images: {image_count}\nAnomalous: {anomalous_count}")
    return image_count, anomalous_count


def add_arguments(parser):
    """
    Registers this script's command line arguments on the supplied parser.
    :param parser: The argparse command line parser to add the arguments to.
    """
    csv_file_help = "The CSV file that you want to process."
    s3_path_help = ("The S3 bucket and folder path for the images."
                    " If not supplied, column 1 is assumed to include the S3 path.")

    # Required positional argument: the CSV file to convert.
    parser.add_argument("csv_file", help=csv_file_help)

    # Optional prefix that is prepended to each image in the CSV file.
    parser.add_argument("--s3_path", required=False, help=s3_path_help)


def main():
    """
    Command line entry point.
    Parses the arguments, checks the CSV file for duplicate images and invalid
    classifications and, if none are found, creates <csv name>.manifest next to the
    CSV file and prints the image counts. If problems are found, no manifest is
    created and the user is pointed at the generated errors and deduplicated files.
    Exceptions are logged and reported on stdout rather than propagated.
    """

    logging.basicConfig(level=logging.INFO,
                        format="%(levelname)s: %(message)s")

    try:

        # get command line arguments
        parser = argparse.ArgumentParser(usage=argparse.SUPPRESS)
        add_arguments(parser)
        args = parser.parse_args()
        s3_path = args.s3_path
        if s3_path is None:
            # No prefix supplied: column 1 of the CSV is used as-is.
            s3_path = ""

        csv_file = args.csv_file
        manifest_file = os.path.splitext(csv_file)[0] + '.manifest'

        # Create manifest file if there are no duplicate images.
        if check_errors(csv_file):
            # The file names below must match the ones check_errors creates.
            print(
                f"Issues found. Use {csv_file}_errors.csv to view duplicates and errors.")
            print(f"{csv_file}_deduplicated.csv contains the first occurrence of a duplicate. "
                  "Update as necessary with the correct information.")
            print(f"Re-run the script with {csv_file}_deduplicated.csv")
        else:
            print('No duplicates found. Creating manifest file')

            image_count, anomalous_count = create_manifest_file(csv_file,
                                                                manifest_file,
                                                                s3_path)

            print(f"Finished creating manifest file: {manifest_file} \n")

            normal_count = image_count-anomalous_count
            print(f"Images processed: {image_count}")
            print(f"Normal: {normal_count}")
            print(f"Anomalous: {anomalous_count}")

    except FileNotFoundError as err:
        logger.exception(f"File not found.:{err}")
        print(f"File not found: {err}. Check your input CSV file")

    except Exception as err:
        logger.exception(f"An error occurred:{err}")
        print(f"An error occurred:{err}")


if __name__ == "__main__":
main()

# snippet-end:[python.example_code.lookoutvision.Scenario_CSVtoManifest]
123 changes: 123 additions & 0 deletions python/example_code/lookoutvision/test/test_csv_to_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@

from csv_to_manifest import check_errors, create_manifest_file
from os.path import exists
from os import remove
import pytest

"""
Unit tests for csv_to_manifest.py.
"""

def clean_up(*argv):
    """
    Removes each of the supplied files that currently exists, so that
    output from a previous test run cannot affect the next one.
    Paths that do not exist are ignored.
    """
    # filter() is lazy, so each path is checked immediately before it is removed.
    for stale_file in filter(exists, argv):
        remove(stale_file)



@pytest.mark.parametrize(
    "csv_file, result",
    [("test/test_csvs/test_s3_supplied.csv", False)],
)
def test_check_no_errors(csv_file, result):
    """
    A valid CSV file passes the check and leaves no
    deduplicated or errors file behind.
    """
    manifest_file = f"{csv_file}_manifest"
    errors_file = f"{csv_file}_errors.csv"
    deduplicated_file = f"{csv_file}_deduplicated.csv"

    # Start from a clean slate so files from earlier runs can't skew the result.
    clean_up(deduplicated_file, errors_file, manifest_file)

    errors_found = check_errors(csv_file)

    assert errors_found == result
    assert not exists(deduplicated_file)
    assert not exists(errors_file)


@pytest.mark.parametrize(
    "csv_file,result",
    [("test/test_csvs/test_dups_errors.csv", True)],
)
def test_check_errors(csv_file, result):
    """
    A CSV file containing duplicates and classification errors is
    reported as faulty, and both the deduplicated and errors CSV
    files are created. No manifest file is produced by the check.
    """
    manifest_file = f"{csv_file}_manifest"
    errors_file = f"{csv_file}_errors.csv"
    deduplicated_file = f"{csv_file}_deduplicated.csv"
    generated_files = (deduplicated_file, errors_file, manifest_file)

    # Remove anything left over from an earlier run.
    clean_up(*generated_files)

    errors_found = check_errors(csv_file)

    assert errors_found == result
    assert exists(deduplicated_file)
    assert exists(errors_file)
    assert not exists(manifest_file)

    # Tidy up the files this test created.
    clean_up(*generated_files)


@pytest.mark.parametrize("csv_file,img_count,anom_count",
                         [
                             ("test/test_csvs/test_s3_supplied.csv", 9, 5)
                         ]
                         )
def test_create_manifest_s3_supplied(csv_file, img_count, anom_count):
    """
    Checks that a CSV file whose image column already includes the
    S3 path creates a manifest file with well-formed S3 locations.
    """

    # The rows in this CSV already hold complete S3 URIs, so no prefix is
    # supplied. Supplying one would produce 's3://.../s3://...' source-refs.
    s3_path = ""
    deduplicated_file = f"{csv_file}_deduplicated.csv"
    errors_file = f"{csv_file}_errors.csv"
    manifest_file = f"{csv_file}_manifest"

    clean_up(deduplicated_file, errors_file, manifest_file)

    image_count, anomalous_count = create_manifest_file(csv_file,
                                                        manifest_file,
                                                        s3_path)
    assert image_count == img_count
    assert anomalous_count == anom_count
    assert exists(manifest_file)
    assert not exists(deduplicated_file)
    assert not exists(errors_file)

    # Every entry must reference exactly one S3 location.
    with open(manifest_file) as manifest:
        for entry in manifest:
            assert entry.count("s3://") == 1

    # Don't leave the generated manifest behind.
    clean_up(manifest_file)


@pytest.mark.parametrize("csv_file,img_count,anom_count",
                         [
                             ('test/test_csvs/test_no_s3.csv', 7, 4)
                         ]
                         )
def test_create_manifest_no_s3_supplied(csv_file, img_count, anom_count):
    """
    Checks that a CSV file with images without S3 path creates
    a manifest file when the S3 path is passed separately.
    """

    # The rows in this CSV are bare file names, so the S3 folder has to be
    # supplied for the manifest entries to point at real S3 locations.
    s3_path = "s3://docexamplebucket1/circuitboard/train/"
    deduplicated_file = f"{csv_file}_deduplicated.csv"
    errors_file = f"{csv_file}_errors.csv"
    manifest_file = f"{csv_file}_manifest"

    clean_up(deduplicated_file, errors_file, manifest_file)

    image_count, anomalous_count = create_manifest_file(csv_file,
                                                        manifest_file,
                                                        s3_path)
    assert image_count == img_count
    assert anomalous_count == anom_count
    assert exists(manifest_file)
    assert not exists(deduplicated_file)
    assert not exists(errors_file)

    # Every entry must reference exactly one S3 location.
    with open(manifest_file) as manifest:
        for entry in manifest:
            assert entry.count("s3://") == 1

    # Don't leave the generated manifest behind.
    clean_up(manifest_file)


Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_1.jpg,anomaly
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg,anomaly
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg,anomalous
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_11.jpg,anomalous

s3://docexamplebucket1/circuitboard/train/normal/train-normal_1.jpg,normal
s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg,normal
s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg,normal
s3://docexamplebucket1/circuitboard/train/normal/train-normal_11.jpg,correct




11 changes: 11 additions & 0 deletions python/example_code/lookoutvision/test/test_csvs/test_no_s3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
train-anomaly_1.jpg,anomaly
train-anomaly_10.jpg,anomaly
train-anomaly_11.jpg,anomaly
train-anomaly_12.jpg,anomaly
train-normal_1.jpg,normal
train-normal_10.jpg,normal
train-normal_11.jpg,normal




Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{"source-ref": "train-anomaly_1.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435927","type": "groundtruth/image-classification"}}
{"source-ref": "train-anomaly_10.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435934","type": "groundtruth/image-classification"}}
{"source-ref": "train-anomaly_11.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435938","type": "groundtruth/image-classification"}}
{"source-ref": "train-anomaly_12.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435941","type": "groundtruth/image-classification"}}
{"source-ref": "train-normal_1.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435945","type": "groundtruth/image-classification"}}
{"source-ref": "train-normal_10.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435949","type": "groundtruth/image-classification"}}
{"source-ref": "train-normal_11.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435952","type": "groundtruth/image-classification"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_1.jpg,anomaly
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg,anomaly
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_11.jpg,anomaly
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_12.jpg,anomaly
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_13.jpg,anomaly
s3://docexamplebucket1/circuitboard/train/normal/train-normal_1.jpg,normal
s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg,normal
s3://docexamplebucket1/circuitboard/train/normal/train-normal_11.jpg,normal
s3://docexamplebucket1/circuitboard/train/normal/train-normal_12.jpg,normal




Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_1.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434742","type": "groundtruth/image-classification"}}
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434766","type": "groundtruth/image-classification"}}
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_11.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434771","type": "groundtruth/image-classification"}}
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_12.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434775","type": "groundtruth/image-classification"}}
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_13.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434779","type": "groundtruth/image-classification"}}
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_1.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434783","type": "groundtruth/image-classification"}}
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434787","type": "groundtruth/image-classification"}}
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_11.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434791","type": "groundtruth/image-classification"}}
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_12.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434794","type": "groundtruth/image-classification"}}

0 comments on commit f8cd654

Please sign in to comment.