forked from awsdocs/aws-doc-sdk-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
377 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,201 @@ | ||
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
""" | ||
Purpose | ||
Shows how to create an Amazon Lookout for Vision manifest file from a CSV file. | ||
The CSV file format is <image location>,<anomaly classification> (normal or anomaly) | ||
For example: | ||
s3://s3bucket/circuitboard/train/anomaly/train_11.jpg,anomaly | ||
s3://s3bucket/circuitboard/train/normal/train_1.jpg,normal | ||
If necessary, use the --s3_path argument to specify the S3 bucket folder for the images. | ||
For more information, see https://docs.aws.amazon.com/lookout-for-vision/latest/developer-guide/ex-csv-manifest.html | ||
""" | ||
|
||
# snippet-start:[python.example_code.lookoutvision.Scenario_CSVtoManifest] | ||
|
||
import argparse
import csv
import json
import logging
import os
from datetime import datetime, timezone
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def check_errors(csv_file):
    """
    Checks for duplicate images and incorrect classifications in a CSV file.

    If duplicate images or invalid anomaly assignments are found, an errors CSV
    file (<csv_file>_errors.csv) and a deduplicated CSV file
    (<csv_file>_deduplicated.csv) are created. Only the first occurrence of an
    image is written to the deduplicated file; later occurrences are recorded
    in the errors file. If no problems are found, both files are removed again.

    Each row of the errors file is: line number in the source file, image,
    classification, error code (INVALID_CLASSIFICATION or DUPLICATE). A row
    that has no classification column is reported as INVALID_CLASSIFICATION
    with an empty classification.

    :param csv_file: The source CSV file.
    :return: True if errors or duplicates are found, otherwise False.
    """

    logger.info(f"Checking {csv_file}.")

    errors_found = False
    valid_classifications = ("normal", "anomaly")

    errors_file = f"{csv_file}_errors.csv"
    deduplicated_file = f"{csv_file}_deduplicated.csv"

    # The csv module expects files opened with newline='' so that it controls
    # line endings itself (otherwise extra blank rows appear on Windows).
    with open(csv_file, 'r', newline='', encoding="utf-8") as f,\
            open(deduplicated_file, 'w', newline='', encoding="utf-8") as dedup,\
            open(errors_file, 'w', newline='', encoding="utf-8") as errors:

        reader = csv.reader(f, delimiter=',')
        dedup_writer = csv.writer(dedup)
        error_writer = csv.writer(errors)
        entries = set()
        for row in reader:

            # Skip empty lines.
            if not ''.join(row).strip():
                continue

            # reader.line_num is the line in the source file, so the reported
            # number stays correct even though blank lines are skipped above.
            line = reader.line_num
            image = row[0]
            # A row without a classification column is reported as an error
            # instead of raising IndexError.
            classification = row[1] if len(row) > 1 else ""

            # Record any incorrect classifications.
            if classification.lower() not in valid_classifications:
                error_writer.writerow(
                    [line, image, classification, "INVALID_CLASSIFICATION"])
                errors_found = True

            # Write first image entry to dedup file and record duplicates.
            if image not in entries:
                dedup_writer.writerow(row)
                entries.add(image)
            else:
                error_writer.writerow(
                    [line, image, classification, "DUPLICATE"])
                errors_found = True

    if errors_found:
        logger.info(f"Errors found check {errors_file}.")
    else:
        # Nothing to report, so don't leave empty helper files behind.
        os.remove(errors_file)
        os.remove(deduplicated_file)

    return errors_found
|
||
|
||
def create_manifest_file(csv_file, manifest_file, s3_path):
    """
    Reads a CSV file and creates a Lookout for Vision classification manifest file.

    Each non-empty CSV row (<image location>,<classification>) becomes one JSON
    Lines entry in the manifest. A classification of 'anomaly' (any letter case)
    is written as anomaly-label 1; every other value is written as
    anomaly-label 0. The classification text itself is written unchanged as
    the class-name.

    :param csv_file: The source CSV file.
    :param manifest_file: The name of the manifest file to create.
    :param s3_path: The S3 path to the folder that contains the images. It is
        prepended unchanged to column 1 of every row; pass "" if column 1
        already holds the full S3 location.
    :return: A tuple of (image count, anomalous image count).
    """
    logger.info(f"Processing CSV file {csv_file}.")

    image_count = 0
    anomalous_count = 0

    with open(csv_file, newline='', encoding="utf-8") as csvfile, \
            open(manifest_file, "w", encoding="utf-8") as output_file:

        # Use the csv module's default quoting, the same dialect check_errors
        # reads and csv.writer produces for the deduplicated file, so that a
        # deduplicated file can be fed straight back into this function.
        image_classifications = csv.reader(csvfile, delimiter=',')

        # Process each row (image) in the CSV file.
        for row in image_classifications:
            # Skip empty lines.
            if not ''.join(row).strip():
                continue

            source_ref = str(s3_path) + row[0]
            classification = 0

            if row[1].lower() == 'anomaly':
                classification = 1
                anomalous_count += 1

            entry = {
                "source-ref": source_ref,
                "anomaly-label": classification,
                "anomaly-label-metadata": {
                    "confidence": 1,
                    "job-name": "labeling-job/anomaly-classification",
                    "class-name": row[1],
                    "human-annotated": "yes",
                    "creation-date": datetime.now(timezone.utc).strftime(
                        '%Y-%m-%dT%H:%M:%S.%f'),
                    "type": "groundtruth/image-classification",
                },
            }

            # json.dumps escapes quotes, backslashes and control characters
            # that hand-built string concatenation would emit as invalid JSON.
            # The separators keep the layout of previously generated manifests.
            json_line = json.dumps(
                entry, ensure_ascii=False, separators=(",", ": ")) + "\n"

            output_file.write(json_line)
            image_count += 1

    logger.info(f"Finished creating manifest file {manifest_file}.\n"
                f"Images: {image_count}\nAnomalous: {anomalous_count}")
    return image_count, anomalous_count
|
||
|
||
def add_arguments(parser):
    """
    Registers this script's command line options on an argument parser.
    :param parser: The command line parser to add the arguments to.
    """
    s3_path_help = (
        "The S3 bucket and folder path for the images."
        " If not supplied, column 1 is assumed to include the S3 path."
    )

    # Positional: the CSV file to convert.
    parser.add_argument("csv_file",
                        help="The CSV file that you want to process.")

    # Optional: prefix for the image locations in column 1.
    parser.add_argument("--s3_path", required=False, help=s3_path_help)
|
||
|
||
def main():
    """
    Command line entry point.

    Parses the arguments, checks the CSV file for duplicates and invalid
    classifications and, if none are found, writes the manifest file next to
    the CSV file (same name with a .manifest extension) and prints a summary.
    If problems are found, no manifest is created and the user is pointed to
    the errors and deduplicated files instead.

    Errors are logged and printed rather than propagated.
    """

    logging.basicConfig(level=logging.INFO,
                        format="%(levelname)s: %(message)s")

    try:

        # Get command line arguments.
        parser = argparse.ArgumentParser(usage=argparse.SUPPRESS)
        add_arguments(parser)
        args = parser.parse_args()
        s3_path = args.s3_path
        if s3_path is None:
            # No prefix supplied: column 1 is used as the image location as-is.
            s3_path = ""

        csv_file = args.csv_file
        manifest_file = os.path.splitext(csv_file)[0] + '.manifest'

        # Create manifest file if there are no duplicate images.
        if check_errors(csv_file):
            # The file names below must match the ones check_errors creates.
            print(
                f"Issues found. Use {csv_file}_errors.csv to view duplicates and errors.")
            print(f"{csv_file}_deduplicated.csv contains the first occurence of a duplicate. "
                  "Update as necessary with the correct information.")
            print(f"Re-run the script with {csv_file}_deduplicated.csv")
        else:
            print('No duplicates found. Creating manifest file')

            image_count, anomalous_count = create_manifest_file(csv_file,
                                                                manifest_file,
                                                                s3_path)

            print(f"Finished creating manifest file: {manifest_file} \n")

            normal_count = image_count-anomalous_count
            print(f"Images processed: {image_count}")
            print(f"Normal: {normal_count}")
            print(f"Anomalous: {anomalous_count}")

    except FileNotFoundError as err:
        logger.exception(f"File not found.:{err}")
        print(f"File not found: {err}. Check your input CSV file")

    except Exception as err:
        logger.exception(f"An error occured:{err}")
        print(f"An error occured:{err}")


if __name__ == "__main__":
    main()
|
||
# snippet-end:[python.example_code.lookoutvision.Scenario_CSVtoManifest] |
123 changes: 123 additions & 0 deletions
123
python/example_code/lookoutvision/test/test_csv_to_manifest.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
|
||
from csv_to_manifest import check_errors, create_manifest_file | ||
from os.path import exists | ||
from os import remove | ||
import pytest | ||
|
||
""" | ||
Unit tests for csv_to_manifest.py. | ||
""" | ||
|
||
def clean_up(*argv):
    """
    Deletes each of the supplied files that exists.
    Used to make sure that no files from a previous test run are present.
    """
    # filter() is lazy, so each path is checked right before it is removed.
    for leftover in filter(exists, argv):
        remove(leftover)
|
||
|
||
|
||
@pytest.mark.parametrize(
    "csv_file, result",
    [('test/test_csvs/test_s3_supplied.csv', False)],
)
def test_check_no_errors(csv_file, result):
    """
    Verifies that a valid CSV file passes the check and that no
    deduplicated or errors file is left behind.
    """
    generated = [
        f"{csv_file}{suffix}"
        for suffix in ("_deduplicated.csv", "_errors.csv", "_manifest")
    ]
    deduplicated_file, errors_file, _manifest_file = generated

    # Remove anything left over from a previous run.
    clean_up(*generated)

    outcome = check_errors(csv_file)

    assert outcome == result
    assert not exists(deduplicated_file)
    assert not exists(errors_file)
|
||
|
||
@pytest.mark.parametrize(
    "csv_file,result",
    [('test/test_csvs/test_dups_errors.csv', True)],
)
def test_check_errors(csv_file, result):
    """
    Verifies that a CSV file containing duplicates and invalid
    classifications produces the deduplicated and errors CSV files,
    and that no manifest file appears.
    """
    generated = [
        f"{csv_file}{suffix}"
        for suffix in ("_deduplicated.csv", "_errors.csv", "_manifest")
    ]
    deduplicated_file, errors_file, manifest_file = generated

    # Remove anything left over from a previous run.
    clean_up(*generated)

    outcome = check_errors(csv_file)

    assert outcome == result
    assert exists(deduplicated_file)
    assert exists(errors_file)
    assert not exists(manifest_file)

    # Tidy up the files this test created.
    clean_up(*generated)
|
||
|
||
@pytest.mark.parametrize("csv_file,img_count,anom_count",
                         [
                             ("test/test_csvs/test_s3_supplied.csv", 9, 5)
                         ]
                         )
def test_create_manifest_s3_supplied(csv_file, img_count, anom_count):
    """
    Checks that a CSV file whose image column already holds the full
    S3 location creates a manifest file with unmodified image locations.
    """

    # Column 1 of this CSV already contains the complete S3 location, so no
    # prefix is passed. Passing the folder here as well would produce doubled
    # "s3://...s3://..." source-refs.
    s3_path = ""
    deduplicated_file = f"{csv_file}_deduplicated.csv"
    errors_file = f"{csv_file}_errors.csv"
    manifest_file = f"{csv_file}_manifest"

    clean_up(deduplicated_file, errors_file, manifest_file)

    image_count, anomalous_count = create_manifest_file(csv_file,
                                                        manifest_file,
                                                        s3_path)
    assert image_count == img_count
    assert anomalous_count == anom_count
    assert exists(manifest_file)

    # Every entry must reference exactly one S3 location.
    with open(manifest_file) as manifest:
        for entry in manifest:
            assert entry.count("s3://") == 1

    assert not exists(deduplicated_file)
    assert not exists(errors_file)
|
||
|
||
@pytest.mark.parametrize("csv_file,img_count,anom_count",
                         [
                             ('test/test_csvs/test_no_s3.csv', 7, 4)
                         ]
                         )
def test_create_manifest_no_s3_supplied(csv_file, img_count, anom_count):
    """
    Checks that a CSV file with image names but no S3 path creates a
    manifest file whose image locations are prefixed with the S3 path argument.
    """

    # This CSV only holds bare file names, so the S3 folder has to come from
    # the s3_path argument; an empty prefix would leave the source-refs
    # without any S3 location.
    s3_path = "s3://docexamplebucket1/circuitboard/train/"
    deduplicated_file = f"{csv_file}_deduplicated.csv"
    errors_file = f"{csv_file}_errors.csv"
    manifest_file = f"{csv_file}_manifest"

    clean_up(deduplicated_file, errors_file, manifest_file)

    image_count, anomalous_count = create_manifest_file(csv_file,
                                                        manifest_file,
                                                        s3_path)
    assert image_count == img_count
    assert anomalous_count == anom_count
    assert exists(manifest_file)

    # Every entry must start with a source-ref located under the S3 prefix.
    with open(manifest_file) as manifest:
        for entry in manifest:
            assert entry.startswith('{"source-ref": "' + s3_path)

    assert not exists(deduplicated_file)
    assert not exists(errors_file)
|
||
|
13 changes: 13 additions & 0 deletions
13
python/example_code/lookoutvision/test/test_csvs/test_dups_errors.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_1.jpg,anomaly | ||
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg,anomaly | ||
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg,anomalous | ||
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_11.jpg,anomalous | ||
|
||
s3://docexamplebucket1/circuitboard/train/normal/train-normal_1.jpg,normal | ||
s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg,normal | ||
s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg,normal | ||
s3://docexamplebucket1/circuitboard/train/normal/train-normal_11.jpg,correct | ||
|
||
|
||
|
||
|
11 changes: 11 additions & 0 deletions
11
python/example_code/lookoutvision/test/test_csvs/test_no_s3.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
train-anomaly_1.jpg,anomaly | ||
train-anomaly_10.jpg,anomaly | ||
train-anomaly_11.jpg,anomaly | ||
train-anomaly_12.jpg,anomaly | ||
train-normal_1.jpg,normal | ||
train-normal_10.jpg,normal | ||
train-normal_11.jpg,normal | ||
|
||
|
||
|
||
|
7 changes: 7 additions & 0 deletions
7
python/example_code/lookoutvision/test/test_csvs/test_no_s3.csv_manifest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{"source-ref": "train-anomaly_1.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435927","type": "groundtruth/image-classification"}} | ||
{"source-ref": "train-anomaly_10.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435934","type": "groundtruth/image-classification"}} | ||
{"source-ref": "train-anomaly_11.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435938","type": "groundtruth/image-classification"}} | ||
{"source-ref": "train-anomaly_12.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435941","type": "groundtruth/image-classification"}} | ||
{"source-ref": "train-normal_1.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435945","type": "groundtruth/image-classification"}} | ||
{"source-ref": "train-normal_10.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435949","type": "groundtruth/image-classification"}} | ||
{"source-ref": "train-normal_11.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.435952","type": "groundtruth/image-classification"}} |
13 changes: 13 additions & 0 deletions
13
python/example_code/lookoutvision/test/test_csvs/test_s3_supplied.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_1.jpg,anomaly | ||
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg,anomaly | ||
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_11.jpg,anomaly | ||
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_12.jpg,anomaly | ||
s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_13.jpg,anomaly | ||
s3://docexamplebucket1/circuitboard/train/normal/train-normal_1.jpg,normal | ||
s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg,normal | ||
s3://docexamplebucket1/circuitboard/train/normal/train-normal_11.jpg,normal | ||
s3://docexamplebucket1/circuitboard/train/normal/train-normal_12.jpg,normal | ||
|
||
|
||
|
||
|
9 changes: 9 additions & 0 deletions
9
python/example_code/lookoutvision/test/test_csvs/test_s3_supplied.csv_manifest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_1.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434742","type": "groundtruth/image-classification"}} | ||
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_10.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434766","type": "groundtruth/image-classification"}} | ||
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_11.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434771","type": "groundtruth/image-classification"}} | ||
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_12.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434775","type": "groundtruth/image-classification"}} | ||
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/anomaly/train-anomaly_13.jpg","anomaly-label": 1,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "anomaly","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434779","type": "groundtruth/image-classification"}} | ||
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_1.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434783","type": "groundtruth/image-classification"}} | ||
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_10.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434787","type": "groundtruth/image-classification"}} | ||
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_11.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434791","type": "groundtruth/image-classification"}} | ||
{"source-ref": "s3://docexamplebucket1/circuitboard/train/s3://docexamplebucket1/circuitboard/train/normal/train-normal_12.jpg","anomaly-label": 0,"anomaly-label-metadata": {"confidence": 1,"job-name": "labeling-job/anomaly-classification","class-name": "normal","human-annotated": "yes","creation-date": "2022-02-14T21:19:38.434794","type": "groundtruth/image-classification"}} |