csv to manifest example

mamunsyuhada · Feb 14, 2022 · f8cd654 · f8cd654
2 parents 87c5532 + ff0b6df
commit f8cd654
Show file tree

Hide file tree

Showing 7 changed files with 377 additions and 0 deletions.
diff --git a/python/example_code/lookoutvision/csv_to_manifest.py b/python/example_code/lookoutvision/csv_to_manifest.py
@@ -0,0 +1,201 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier:  Apache-2.0
+
+"""
+Purpose
+Shows how to create an Amazon Lookout for Vision manifest file from a CSV file. 
+The CSV file format is <image location>,<anomaly classification> (normal or anomaly)
+For example:
+s3://s3bucket/circuitboard/train/anomaly/train_11.jpg,anomaly
+s3://s3bucket/circuitboard/train/normal/train_1.jpg,normal
+
+If necessary, use the --s3_path argument to specify the S3 bucket and folder path for the images.
+For more information, see https://docs.aws.amazon.com/lookout-for-vision/latest/developer-guide/ex-csv-manifest.html
+"""
+
+# snippet-start:[python.example_code.lookoutvision.Scenario_CSVtoManifest]
+
+from datetime import datetime, timezone
+import argparse
+import logging
+import csv
+import os
+
+# Module-level logger named after this module; main() sets up the logging configuration.
+logger = logging.getLogger(__name__)
+
+
+def check_errors(csv_file):
+    """
+    Checks for duplicate images and incorrect classifications in a CSV file.
+    An errors CSV file (<csv_file>_errors.csv) and a deduplicated CSV file
+    (<csv_file>_deduplicated.csv) are written while checking. If no problems
+    are found, both files are deleted before returning.
+
+    The first row seen for an image is written to the deduplicated file, even
+    if its classification is invalid, so that it can be corrected by hand.
+    Later rows for the same image are recorded in the errors file as DUPLICATE.
+    A row whose classification is missing, or is not "normal" or "anomaly"
+    (case-insensitive), is recorded in the errors file as INVALID_CLASSIFICATION.
+    Each errors row is: line number, image, classification, error code.
+
+    :param csv_file: The source CSV file
+    :return: True if errors or duplicates are found, otherwise false.
+    """
+
+    logger.info(f"Checking {csv_file}.")
+
+    valid_classifications = ("normal", "anomaly")
+    errors_found = False
+
+    errors_file = f"{csv_file}_errors.csv"
+    deduplicated_file = f"{csv_file}_deduplicated.csv"
+
+    # The csv module does its own newline handling, so every file that it
+    # reads or writes is opened with newline=''.
+    with open(csv_file, 'r', newline='') as f,\
+            open(deduplicated_file, 'w', newline='') as dedup,\
+            open(errors_file, 'w', newline='') as errors:
+
+        reader = csv.reader(f, delimiter=',')
+        dedup_writer = csv.writer(dedup)
+        error_writer = csv.writer(errors)
+        entries = set()
+        for row in reader:
+
+            # Skip empty lines
+            if not ''.join(row).strip():
+                continue
+
+            # Use the reader's own position so that skipped blank lines
+            # don't shift the line numbers reported in the errors file.
+            line = reader.line_num
+            image = row[0]
+            # A row without a second column has no classification. Report it
+            # as invalid instead of failing with an IndexError.
+            classification = row[1] if len(row) > 1 else ""
+
+            # record any incorrect classifications
+            if classification.lower() not in valid_classifications:
+                error_writer.writerow(
+                    [line, image, classification, "INVALID_CLASSIFICATION"])
+                errors_found = True
+
+            # write first image entry to dedup file and record duplicates
+            if image not in entries:
+                dedup_writer.writerow(row)
+                entries.add(image)
+            else:
+                error_writer.writerow(
+                    [line, image, classification, "DUPLICATE"])
+                errors_found = True
+
+    if errors_found:
+        logger.info(f"Errors found check {errors_file}.")
+    else:
+        # Nothing to report, so don't leave empty output files behind.
+        os.remove(errors_file)
+        os.remove(deduplicated_file)
+
+    return errors_found
+
+
+def create_manifest_file(csv_file, manifest_file, s3_path):
+    """
+    Reads a CSV file and creates a Lookout for Vision classification manifest file.
+    Each non-empty CSV row becomes one JSON line in the manifest. A row whose
+    classification is "anomaly" (case-insensitive) gets anomaly-label 1. Every
+    other row gets anomaly-label 0. The class-name is written in lower case.
+    :param csv_file: The source CSV file
+    :param manifest_file: The name of the manifest file to create.
+    :param s3_path: The S3 path to the folder that contains the images. It is
+    prepended, unchanged, to the image location in column 1.
+    :return: A tuple of (number of images written, number of anomalous images).
+    :raises ValueError: If a non-empty row has no classification column.
+    """
+    # Imported here so that the function doesn't rely on the module's import block.
+    import json
+
+    logger.info(f"Processing CSV file {csv_file}.")
+
+    image_count = 0
+    anomalous_count = 0
+
+    with open(csv_file, newline='') as csvfile, open(manifest_file, "w") as output_file:
+
+        # Use the default CSV dialect so that rows are parsed the same way
+        # here as they are when the file is checked for errors.
+        image_classifications = csv.reader(csvfile, delimiter=',')
+
+        # process each row (image) in CSV file
+        for row in image_classifications:
+            # Skip empty lines
+            if not ''.join(row).strip():
+                continue
+
+            if len(row) < 2:
+                raise ValueError(
+                    f"Line {image_classifications.line_num} of {csv_file} "
+                    "has no classification column.")
+
+            source_ref = str(s3_path) + row[0]
+            class_name = row[1].lower()
+            classification = 0
+
+            if class_name == 'anomaly':
+                classification = 1
+                anomalous_count += 1
+
+            entry = {
+                "source-ref": source_ref,
+                "anomaly-label": classification,
+                "anomaly-label-metadata": {
+                    "confidence": 1,
+                    "job-name": "labeling-job/anomaly-classification",
+                    "class-name": class_name,
+                    "human-annotated": "yes",
+                    "creation-date": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%S.%f'),
+                    "type": "groundtruth/image-classification"
+                }
+            }
+
+            # json.dumps escapes quotes and backslashes in the image location.
+            # The separators keep the layout of the manifest line unchanged.
+            json_line = json.dumps(
+                entry, separators=(",", ": "), ensure_ascii=False) + "\n"
+
+            output_file.write(json_line)
+            image_count += 1
+
+    logger.info(f"Finished creating manifest file {manifest_file}.\n"
+                f"Images: {image_count}\nAnomalous: {anomalous_count}")
+    return image_count, anomalous_count
+
+
+def add_arguments(parser):
+    """
+    Registers this script's command line arguments on the supplied parser:
+    the positional csv_file and the optional --s3_path.
+    :param parser: The command line parser to add the arguments to.
+    """
+
+    csv_file_help = "The CSV file that you want to process."
+    s3_path_help = ("The S3 bucket and folder path for the images."
+                    " If not supplied, column 1 is assumed to include the S3 path.")
+
+    parser.add_argument("csv_file", help=csv_file_help)
+    parser.add_argument("--s3_path", help=s3_path_help, required=False)
+
+
+def main():
+    """
+    Entry point for the script. Parses the command line, checks the CSV file
+    for duplicate images and invalid classifications and, if none are found,
+    creates the manifest file. The manifest file is named after the CSV file,
+    with the extension replaced by .manifest.
+    Errors are logged and printed. They are not re-raised.
+    """
+
+    logging.basicConfig(level=logging.INFO,
+                        format="%(levelname)s: %(message)s")
+
+    try:
+
+        # get command line arguments
+        parser = argparse.ArgumentParser(usage=argparse.SUPPRESS)
+        add_arguments(parser)
+        args = parser.parse_args()
+        # If --s3_path is omitted, column 1 is used as the image location unchanged.
+        s3_path = args.s3_path or ""
+
+        csv_file = args.csv_file
+        manifest_file = os.path.splitext(csv_file)[0] + '.manifest'
+
+        # Create manifest file if there are no duplicate images.
+        if check_errors(csv_file):
+            print(
+                f"Issues found. Use {csv_file}_errors.csv to view duplicates and errors.")
+            print(f"{csv_file}_deduplicated.csv contains the first occurrence of a duplicate. "
+                  "Update as necessary with the correct information.")
+            print(f"Re-run the script with {csv_file}_deduplicated.csv")
+        else:
+            print('No duplicates found. Creating manifest file')
+
+            image_count, anomalous_count = create_manifest_file(csv_file,
+                manifest_file,
+                s3_path)
+
+            print(f"Finished creating manifest file: {manifest_file} \n")
+
+            normal_count = image_count-anomalous_count
+            print(f"Images processed: {image_count}")
+            print(f"Normal: {normal_count}")
+            print(f"Anomalous: {anomalous_count}")
+
+    except FileNotFoundError as err:
+        logger.exception(f"File not found: {err}")
+        print(f"File not found: {err}. Check your input CSV file")
+
+    except Exception as err:
+        # Top-level boundary of the script: report the problem instead of
+        # ending with a traceback only.
+        logger.exception(f"An error occurred: {err}")
+        print(f"An error occurred: {err}")
+
+
+# Run main() only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
+
+# snippet-end:[python.example_code.lookoutvision.Scenario_CSVtoManifest]
diff --git a/python/example_code/lookoutvision/test/test_csv_to_manifest.py b/python/example_code/lookoutvision/test/test_csv_to_manifest.py
@@ -0,0 +1,123 @@
+
+from csv_to_manifest import check_errors, create_manifest_file
+from os.path import exists
+from os import remove
+import pytest
+
+"""
+Unit tests for csv_to_manifest.py.
+"""
+
+def clean_up(*argv):
+    """
+    Removes each supplied file that currently exists, so that output
+    left over from an earlier test run can't affect a test.
+    """
+    # filter() is lazy, so each path is checked immediately before it is removed.
+    for leftover in filter(exists, argv):
+        remove(leftover)
+
+
+
+@pytest.mark.parametrize("csv_file, result",
+    [
+        ('test/test_csvs/test_s3_supplied.csv', False)
+    ]
+)
+def test_check_no_errors(csv_file, result):
+    """
+    Verifies that a CSV file without duplicates or invalid classifications
+    passes the check and leaves no deduplicated or errors file behind.
+    """
+
+    dedup_path, errors_path, manifest_path = (
+        f"{csv_file}{suffix}"
+        for suffix in ("_deduplicated.csv", "_errors.csv", "_manifest")
+    )
+
+    # Start from a clean slate so that old output can't influence the asserts.
+    clean_up(dedup_path, errors_path, manifest_path)
+
+    outcome = check_errors(csv_file)
+
+    assert outcome == result
+    assert not exists(dedup_path)
+    assert not exists(errors_path)
+
+
+@pytest.mark.parametrize("csv_file,result",
+    [
+        ('test/test_csvs/test_dups_errors.csv', True)
+    ]
+)
+def test_check_errors(csv_file, result):
+    """
+    Verifies that a CSV file containing duplicate images and invalid
+    classifications is reported, and that the deduplicated and errors
+    CSV files are created without a manifest file being written.
+    """
+
+    generated = tuple(
+        f"{csv_file}{suffix}"
+        for suffix in ("_deduplicated.csv", "_errors.csv", "_manifest")
+    )
+    dedup_path, errors_path, manifest_path = generated
+
+    # Remove output from earlier runs before checking.
+    clean_up(*generated)
+
+    outcome = check_errors(csv_file)
+
+    assert outcome == result
+    assert exists(dedup_path)
+    assert exists(errors_path)
+    assert not exists(manifest_path)
+
+    # Don't leave the generated files in the test data folder.
+    clean_up(*generated)
+
+
+@pytest.mark.parametrize("csv_file,img_count,anom_count",
+    [
+        ("test/test_csvs/test_s3_supplied.csv", 9,5)
+    ]
+)
+def test_create_manifest_s3_supplied(csv_file, img_count, anom_count):
+    """
+    Checks that a CSV file whose image locations already include the
+    S3 path creates a manifest file with those locations unchanged.
+    """
+    import json
+
+    # Every row in this CSV file is a complete S3 URI, so no prefix is passed.
+    # A prefix would be prepended to each location ("s3://...s3://...").
+    s3_path=""
+    expected_prefix="s3://docexamplebucket1/circuitboard/train/"
+    deduplicated_file=f"{csv_file}_deduplicated.csv"
+    errors_file=f"{csv_file}_errors.csv"
+    manifest_file = f"{csv_file}_manifest"
+
+    clean_up(deduplicated_file, errors_file,manifest_file)
+
+    image_count, anomalous_count = create_manifest_file(csv_file,
+                manifest_file,
+                s3_path)
+    assert image_count == img_count
+    assert anomalous_count == anom_count
+    assert exists(manifest_file)
+    assert not exists(deduplicated_file)
+    assert not exists(errors_file)
+
+    # Each manifest line must be valid JSON with a single, well-formed S3 URI.
+    with open(manifest_file) as manifest:
+        source_refs = [json.loads(line)["source-ref"] for line in manifest]
+    assert len(source_refs) == img_count
+    for source_ref in source_refs:
+        assert source_ref.startswith(expected_prefix)
+        assert source_ref.count("s3://") == 1
+
+
+@pytest.mark.parametrize("csv_file,img_count,anom_count",
+    [
+        ('test/test_csvs/test_no_s3.csv', 7,4)
+    ]
+)
+def test_create_manifest_no_s3_supplied(csv_file,img_count, anom_count):
+    """
+    Checks that a CSV file whose image locations don't include the S3 path
+    creates a manifest file in which the supplied S3 path is prepended
+    to every image location.
+    """
+    import json
+
+    # The rows in this CSV file are bare file names, so the S3 folder
+    # has to be passed to get usable source-ref values.
+    s3_path="s3://docexamplebucket1/circuitboard/train/"
+    deduplicated_file=f"{csv_file}_deduplicated.csv"
+    errors_file=f"{csv_file}_errors.csv"
+    manifest_file = f"{csv_file}_manifest"
+
+    clean_up(deduplicated_file, errors_file,manifest_file)
+
+    image_count, anomalous_count = create_manifest_file(csv_file,
+                manifest_file,
+                s3_path)
+    assert image_count == img_count
+    assert anomalous_count ==  anom_count
+    assert exists(manifest_file)
+    assert not exists(deduplicated_file)
+    assert not exists(errors_file)
+
+    # Each manifest line must be valid JSON with the S3 path prepended once.
+    with open(manifest_file) as manifest:
+        source_refs = [json.loads(line)["source-ref"] for line in manifest]
+    assert len(source_refs) == img_count
+    for source_ref in source_refs:
+        assert source_ref.startswith(s3_path)
+        assert source_ref.count("s3://") == 1
+
+
diff --git a/python/example_code/lookoutvision/test/test_csvs/test_dups_errors.csv b/python/example_code/lookoutvision/test/test_csvs/test_dups_errors.csv
@@ -0,0 +1,13 @@
+s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_1.jpg,anomaly
+s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg,anomaly
+s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg,anomalous
+s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_11.jpg,anomalous
+
+s3://docexamplebucket1/circuitboard/train/normal/train-normal_1.jpg,normal
+s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg,normal
+s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg,normal
+s3://docexamplebucket1/circuitboard/train/normal/train-normal_11.jpg,correct
+
+
+
+
diff --git a/python/example_code/lookoutvision/test/test_csvs/test_no_s3.csv b/python/example_code/lookoutvision/test/test_csvs/test_no_s3.csv
@@ -0,0 +1,11 @@
+train-anomaly_1.jpg,anomaly
+train-anomaly_10.jpg,anomaly
+train-anomaly_11.jpg,anomaly
+train-anomaly_12.jpg,anomaly
+train-normal_1.jpg,normal
+train-normal_10.jpg,normal
+train-normal_11.jpg,normal
+
+
+
+
diff --git a/python/example_code/lookoutvision/test/test_csvs/test_no_s3.csv_manifest b/python/example_code/lookoutvision/test/test_csvs/test_no_s3.csv_manifest
@@ -0,0 +1,7 @@
+{"source-ref": "train-anomaly_1.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435927","type": "groundtruth/image-classification"}}
+{"source-ref": "train-anomaly_10.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435934","type": "groundtruth/image-classification"}}
+{"source-ref": "train-anomaly_11.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435938","type": "groundtruth/image-classification"}}
+{"source-ref": "train-anomaly_12.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435941","type": "groundtruth/image-classification"}}
+{"source-ref": "train-normal_1.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435945","type": "groundtruth/image-classification"}}
+{"source-ref": "train-normal_10.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435949","type": "groundtruth/image-classification"}}
+{"source-ref": "train-normal_11.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435952","type": "groundtruth/image-classification"}}
diff --git a/python/example_code/lookoutvision/test/test_csvs/test_s3_supplied.csv b/python/example_code/lookoutvision/test/test_csvs/test_s3_supplied.csv
@@ -0,0 +1,13 @@
+s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_1.jpg,anomaly
+s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg,anomaly
+s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_11.jpg,anomaly
+s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_12.jpg,anomaly
+s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_13.jpg,anomaly
+s3://docexamplebucket1/circuitboard/train/normal/train-normal_1.jpg,normal
+s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg,normal
+s3://docexamplebucket1/circuitboard/train/normal/train-normal_11.jpg,normal
+s3://docexamplebucket1/circuitboard/train/normal/train-normal_12.jpg,normal
+
+
+
+
diff --git a/python/example_code/lookoutvision/test/test_csvs/test_s3_supplied.csv_manifest b/python/example_code/lookoutvision/test/test_csvs/test_s3_supplied.csv_manifest
@@ -0,0 +1,9 @@
+{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_1.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434742","type": "groundtruth/image-classification"}}
+{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434766","type": "groundtruth/image-classification"}}
+{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_11.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434771","type": "groundtruth/image-classification"}}
+{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_12.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434775","type": "groundtruth/image-classification"}}
+{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_13.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434779","type": "groundtruth/image-classification"}}
+{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_1.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434783","type": "groundtruth/image-classification"}}
+{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434787","type": "groundtruth/image-classification"}}
+{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_11.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434791","type": "groundtruth/image-classification"}}
+{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_12.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434794","type": "groundtruth/image-classification"}}