From 4b151afb396d4d68abe3f160bf5eb3c0da48dab6 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Wed, 6 Oct 2021 20:05:06 -0700
Subject: [PATCH 01/16] remove docs stuff from readme

---
 README.md | 107 +-----------------------------------------------------
 1 file changed, 2 insertions(+), 105 deletions(-)

diff --git a/README.md b/README.md
index b8bbd3e..c116b38 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,8 @@

Examples for Hub - Dataset Format for AI

-
-
-A repository showcasing examples of using [Hub](https://github.com/activeloopai/Hub)
-
-- [Uploading Dataset Places365](datasets/places365)
+
+Note: This repository only contains examples for using [hub](https://github.com/activeloopai/Hub) (no hub source code).
 
 ### Colab Tutorials
@@ -20,106 +18,5 @@ A repository showcasing examples of using [Hub](https://github.com/pytor
 | Training an Image Classification Model in PyTorch | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/activeloopai/examples/blob/main/colabs/Training_an_Image_Classification_Model_in_PyTorch.ipynb) |
 
-
-## Getting Started with Hub πŸš€
-
-
-### Installation
-Hub is written in 100% Python and can be quickly installed using pip.
-```sh
-pip3 install hub
-```
-
-
-### Creating Datasets
-
-A hub dataset can be created in various locations (storage providers). Here is what the path looks like for each of them:
-
-| Storage provider | Example path                   |
-| ---------------- | ------------------------------ |
-| Hub cloud        | hub://user_name/dataset_name   |
-| AWS S3           | s3://bucket_name/dataset_name  |
-| GCP              | gcp://bucket_name/dataset_name |
-| Local storage    | path to local directory        |
-| In-memory        | mem://dataset_name             |
-
-
-
-Let's create a dataset in the Hub cloud. Create a new account with Hub from the terminal using `activeloop register` if you haven't already. You will be asked for a user name, email ID, and password. The user name you enter here will be used in the dataset path.
-
-```sh
-$ activeloop register
-Enter your details. Your password must be at least 6 characters long.
-Username:
-Email:
-Password:
-```
-
-Initialize an empty dataset in the hub cloud:
-
-```python
-import hub
-
-ds = hub.empty("hub://<username>/test-dataset")
-```
-
-Next, create a tensor to hold images in the dataset we just initialized:
-
-```python
-images = ds.create_tensor("images", htype="image", sample_compression="jpg")
-```
-
-Assuming you have a list of image file paths, let's upload them to the dataset:
-
-```python
-image_paths = ...
-with ds:
-    for image_path in image_paths:
-        image = hub.read(image_path)
-        ds.images.append(image)
-```
-
-Alternatively, you can also upload numpy arrays. Since the `images` tensor was created with `sample_compression="jpg"`, the arrays will be compressed with jpeg compression.
-
-
-```python
-import numpy as np
-
-with ds:
-    for _ in range(1000):  # 1000 random images
-        random_image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)  # 100x100 image with 3 channels
-        ds.images.append(random_image)
-```
-
-
-
-### Loading Datasets
-
-
-You can load the dataset you just created with a single line of code:
-
-```python
-import hub
-
-ds = hub.load("hub://<username>/test-dataset")
-```
-
-You can also access other publicly available hub datasets, not just the ones you created. Here is how you would load the [Objectron Bikes Dataset](https://github.com/google-research-datasets/Objectron):
-
-```python
-import hub
-
-ds = hub.load('hub://activeloop/objectron_bike_train')
-```
-
-To get the first image in the Objectron Bikes dataset in numpy format:
-
-
-```python
-image_arr = ds.image[0].numpy()
-```
-
-
-
 ## Documentation
 Getting started guides, examples, tutorials, API reference, and other usage information can be found on our [documentation page](http://docs.activeloop.ai/?utm_source=github&utm_medium=repo&utm_campaign=readme).

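Editor's note: the README removed by the patch above walks through a create-append-load loop against the Hub cloud. For reference, here is a minimal, self-contained sketch of the same loop against a local path, so no Activeloop account is needed. It assumes the hub 2.x API used throughout this series; the `./quickstart-dataset` path and the tensor names are illustrative, not part of the patch.

```python
import hub
import numpy as np

# Create a local dataset (any path style from the storage-provider table should work).
ds = hub.empty("./quickstart-dataset", overwrite=True)

with ds:
    # Declare tensors once, then append sample-by-sample.
    ds.create_tensor("images", htype="image", sample_compression="jpg")
    ds.create_tensor("labels", htype="class_label")
    for i in range(10):
        image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
        ds.images.append(image)            # compressed to jpg on write
        ds.labels.append(np.uint32(i % 2))

# Loading is the same call regardless of where the dataset lives.
ds = hub.load("./quickstart-dataset")
print(ds.images[0].numpy().shape)  # (100, 100, 3)
```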
From 7f75ff9cc1f1d11fb9efb35aab36ce4b54cb0738 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Wed, 6 Oct 2021 20:35:35 -0700
Subject: [PATCH 02/16] remove messy examples and add first super simple example

---
 .gitignore                                    |    2 +
 colabs/Creating_Complex_Datasets.ipynb        |  402 -----
 .../Creating_Object_Detection_Datasets.ipynb  |  362 ----
 ..._Processing_Using_Parallel_Computing.ipynb |  327 ----
 colabs/Getting_Started_with_Hub.ipynb         | 1601 -----------------
 ...mage_Classification_Model_in_PyTorch.ipynb |  390 ----
 datasets/places365/upload.py                  |  115 --
 hub_examples/uploading/npy.py                 |   35 +
 8 files changed, 37 insertions(+), 3197 deletions(-)
 delete mode 100644 colabs/Creating_Complex_Datasets.ipynb
 delete mode 100644 colabs/Creating_Object_Detection_Datasets.ipynb
 delete mode 100644 colabs/Data_Processing_Using_Parallel_Computing.ipynb
 delete mode 100644 colabs/Getting_Started_with_Hub.ipynb
 delete mode 100644 colabs/Training_an_Image_Classification_Model_in_PyTorch.ipynb
 delete mode 100644 datasets/places365/upload.py
 create mode 100644 hub_examples/uploading/npy.py

diff --git a/.gitignore b/.gitignore
index b6e4761..da37e0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@
 __pycache__/
 *.py[cod]
 *$py.class
+_datasets/*
+
 
 # C extensions
 *.so
diff --git a/colabs/Creating_Complex_Datasets.ipynb b/colabs/Creating_Complex_Datasets.ipynb
deleted file mode 100644
index 668dea3..0000000
--- a/colabs/Creating_Complex_Datasets.ipynb
+++ /dev/null
@@ -1,402 +0,0 @@
-{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Creating Complex Datasets", - "provenance": [], - "collapsed_sections": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lKU8kmSs65xv" - }, - "source": [ - "# ***Creating Complex Datasets***\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3zK9b4yiMRzB" - }, - "source": [ - "#### Datasets often have multiple labels such as classifications, bounding boxes, segmentations, and others. In order to create an intuitive layout of tensors, it's advisable to create a dataset hierarchy that captures the relationship between the different label types. This can be done with hub tensor `groups`.\n", - "\n", - "#### This example shows how to use groups to create a dataset containing image classifications of \"indoor\" and \"outdoor\", as well as bounding boxes of objects such as \"dog\" and \"cat\". 
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3UseHLcoRIYz" - }, - "source": [ - "## Install Hub" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "l5mOffq5RN-T" - }, - "source": [ - "from IPython.display import clear_output\n", - "!pip3 install hub\n", - "clear_output()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "SOkA83IsRWYo" - }, - "source": [ - "# IMPORTANT - Please restart your Colab runtime after installing Hub!\n", - "# This is a Colab-specific issue that prevents PIL from working properly.\n", - "import os\n", - "os.kill(os.getpid(), 9)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7wGo53ndMTCB" - }, - "source": [ - "## Create the Hub Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "52h9xKujOJFs" - }, - "source": [ - "The first step is to download the small dataset below called *animals complex*." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "V6m__biyt5I1" - }, - "source": [ - "# Download dataset\n", - "from IPython.display import clear_output\n", - "!wget https://firebasestorage.googleapis.com/v0/b/gitbook-28427.appspot.com/o/assets%2F-M_MXHpa1Cq7qojD2u_r%2F-MjzA6TRhECk7QihMIFb%2F-MjzIXAcW0ORldVig8Rk%2Fanimals_complex.zip?alt=media&token=5e25aef5-21d8-43b8-afa5-01dd4a20d2e6\n", - "clear_output()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "_fNxNZIft5F-" - }, - "source": [ - "# Unzip to './animals_od' folder\n", - "!unzip -qq /content/assets%2F-M_MXHpa1Cq7qojD2u_r%2F-MjzA6TRhECk7QihMIFb%2F-MjzIXAcW0ORldVig8Rk%2Fanimals_complex.zip?alt=media" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bLh4uuIMuNwt" - }, - "source": [ - "The dataset has the following folder structure:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JHyrqNgNuRO2" - }, - "source": [ - "animals_complex\n", - "- classification\n", - " - indoor\n", - " - image1.png\n", - " - image2.png\n", - " - outdoor\n", - " - image3.png\n", - " - image4.png\n", - "- boxes\n", - " - image1.txt\n", - " - image3.txt\n", - " - image3.txt\n", - " - image4.txt\n", - " - classes.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g_iOi_9NuXAI" - }, - "source": [ - "Now that you have the data, let's create a Hub Dataset in the `./animals_complex_hub` folder by running: \n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "qaZtpnpTOp-5" - }, - "source": [ - "import hub\n", - "from PIL import Image, ImageDraw\n", - "import numpy as np\n", - "import os\n", - "\n", - "ds = hub.empty('./animals_complex_hub') # Create the dataset" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qNMOv3LPOyAd" - }, - "source": [ - "Next, let's specify the folder paths containing the classification and object detection data. It's also helpful to create a list of all of the image files and class names for classification and object detection tasks." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZCjN0EKwO1Pu" - }, - "source": [ - "classification_folder = './animals_complex/classification'\n", - "boxes_folder = './animals_complex/boxes'\n", - "\n", - "# List of all class names for classification\n", - "class_names = os.listdir(classification_folder)\n", - "\n", - "fn_imgs = []\n", - "for dirpath, dirnames, filenames in os.walk(classification_folder):\n", - " for filename in filenames:\n", - " fn_imgs.append(os.path.join(dirpath, filename))\n", - "\n", - "# List of all class names for object detection \n", - "with open(os.path.join(boxes_folder, 'classes.txt'), 'r') as f:\n", - " class_names_boxes = f.read().splitlines()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g4CPD4nmO3_S" - }, - "source": [ - "Since annotations in YOLO are typically stored in text files, it's useful to write a helper function that parses the annotation file and returns numpy arrays with the bounding box coordinates and bounding box classes." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "bRIDfYXNO7kg" - }, - "source": [ - "def read_yolo_boxes(fn:str):\n", - " \"\"\"\n", - " Function reads a label.txt YOLO file and returns a numpy array of yolo_boxes \n", - " for the box geometry and yolo_labels for the corresponding box labels.\n", - " \"\"\"\n", - " \n", - " box_f = open(fn)\n", - " lines = box_f.read()\n", - " box_f.close()\n", - " \n", - " # Split each box into a separate lines\n", - " lines_split = lines.splitlines()\n", - " \n", - " yolo_boxes = np.zeros((len(lines_split),4))\n", - " yolo_labels = np.zeros(len(lines_split))\n", - " \n", - " # Go through each line and parse data\n", - " for l, line in enumerate(lines_split):\n", - " line_split = line.split()\n", - " yolo_boxes[l,:]=np.array((float(line_split[1]), float(line_split[2]), float(line_split[3]), float(line_split[4])))\n", - " yolo_labels[l]=int(line_split[0]) \n", - " \n", - " return yolo_boxes, yolo_labels" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WKvPUjxcUPvO" - }, - "source": [ - "Next, let's create the groups and tensors for this data. In order to separate the two annotations, a `boxes` group is created to wrap around the `label` and `bbox` tensors which contains the coordinates and labels for the bounding boxes." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "u2F4TXd0UEtH" - }, - "source": [ - "with ds:\n", - " # Image\n", - " ds.create_tensor('images', htype='image', sample_compression='jpeg')\n", - " \n", - " # Classification\n", - " ds.create_tensor('labels', htype='class_label', class_names = class_names)\n", - " \n", - " # Object Detection\n", - " ds.create_group('boxes')\n", - " ds.boxes.create_tensor('bbox', htype='bbox')\n", - " ds.boxes.create_tensor('label', htype='class_label', class_names = class_names_boxes)\n", - " # An alternate approach is to use '/' notation, which automatically creates the boxes group\n", - " # ds.create_tensor('boxes/bbox', ...)\n", - " # ds.create_tensor('boxes/label', ...)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IlCH5Xy8NTvx" - }, - "source": [ - "Finally, let's iterate through all the images in the dataset in order to populate the data in Hub. The first axis of the `boxes.bbox` sample array corresponds to the first-and-only axis of the `boxes.label` sample array (i.e. 
if there are 3 boxes in an image, the labels array is 3x1 and the boxes array is 3x4)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "wz5lsbCeNUDg" - }, - "source": [ - "with ds:\n", - " # Iterate through the images\n", - " for fn_img in fn_imgs:\n", - " \n", - " img_name = os.path.splitext(os.path.basename(fn_img))[0]\n", - " fn_box = img_name+'.txt'\n", - " \n", - " # Get the class number for the classification\n", - " label_text = os.path.basename(os.path.dirname(fn_img))\n", - " label_num = class_names.index(label_text)\n", - " \n", - " # Get the arrays for the bounding boxes and their classes\n", - " yolo_boxes, yolo_labels = read_yolo_boxes(os.path.join(boxes_folder,fn_box))\n", - " \n", - " # Append classification data to tensors\n", - " ds.images.append(hub.read(os.path.join(fn_img)))\n", - " ds.labels.append(np.uint32(label_num))\n", - " \n", - " # Append object detection data to tensors\n", - " ds.boxes.label.append(yolo_labels.astype(np.uint32))\n", - " ds.boxes.bbox.append(yolo_boxes.astype(np.float32))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aYCI61o-O9CV" - }, - "source": [ - "## Inspect the Hub Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pXkD-gLgO_7L" - }, - "source": [ - "Let's check out the second sample from this dataset, which contains two bounding boxes." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PEPTKmCiPD-T" - }, - "source": [ - "# Draw bounding boxes and the classification label for the second image\n", - "\n", - "ind = 1\n", - "img = Image.fromarray(ds.images[ind].numpy())\n", - "draw = ImageDraw.Draw(img)\n", - "(w,h) = img.size\n", - "boxes = ds.boxes.bbox[ind].numpy()\n", - "\n", - "for b in range(boxes.shape[0]):\n", - " (xc,yc) = (int(boxes[b][0]*w), int(boxes[b][1]*h))\n", - " (x1,y1) = (int(xc-boxes[b][2]*w/2), int(yc-boxes[b][3]*h/2))\n", - " (x2,y2) = (int(xc+boxes[b][2]*w/2), int(yc+boxes[b][3]*h/2))\n", - " draw.rectangle([x1,y1,x2,y2], width=2)\n", - " draw.text((x1,y1), ds.boxes.label.info.class_names[ds.boxes.label[ind].numpy()[b]])\n", - " draw.text((0,0), ds.labels.info.class_names[ds.labels[ind].numpy()[0]])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "XZMcRLeQPHq6" - }, - "source": [ - "# Display the image and its bounding boxes\n", - "img" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "79QnkE-UUySP" - }, - "source": [ - "Congrats! You just created a dataset with multiple types of annotations! 
πŸŽ‰" - ] - } - ] -} diff --git a/colabs/Creating_Object_Detection_Datasets.ipynb b/colabs/Creating_Object_Detection_Datasets.ipynb deleted file mode 100644 index c4f8fa1..0000000 --- a/colabs/Creating_Object_Detection_Datasets.ipynb +++ /dev/null @@ -1,362 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Creating Object Detection Datasets", - "provenance": [], - "collapsed_sections": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lKU8kmSs65xv" - }, - "source": [ - "# ***Creating Object Detection Datasets***\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3zK9b4yiMRzB" - }, - "source": [ - "#### Object detection and image annotation using bounding boxes is one of the most common data types for Computer Vision datasets. This tutorial demonstrates how to convert an object detection dataset in YOLO format into Hub, and a similar process can be used for uploading object detection data in other formats." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3UseHLcoRIYz" - }, - "source": [ - "## Install Hub" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "l5mOffq5RN-T" - }, - "source": [ - "from IPython.display import clear_output\n", - "!pip3 install hub\n", - "clear_output()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "SOkA83IsRWYo" - }, - "source": [ - "# IMPORTANT - Please restart your Colab runtime after installing Hub!\n", - "# This is a Colab-specific issue that prevents PIL from working properly.\n", - "import os\n", - "os.kill(os.getpid(), 9)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7wGo53ndMTCB" - }, - "source": [ - "## Create the Hub Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "52h9xKujOJFs" - }, - "source": [ - "The first step is to download the small dataset below called *animals object detection*." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "V6m__biyt5I1" - }, - "source": [ - "# Download dataset\n", - "from IPython.display import clear_output\n", - "!wget https://firebasestorage.googleapis.com/v0/b/gitbook-28427.appspot.com/o/assets%2F-M_MXHpa1Cq7qojD2u_r%2F-MiagkMwYEcI7NIOZgRQ%2F-MiaqvVXGkxDkTTxGR1K%2Fanimals_od.zip?alt=media&token=35854752-3b67-4248-a621-96b63daf00d3\n", - "clear_output()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "_fNxNZIft5F-" - }, - "source": [ - "# Unzip to './animals_od' folder\n", - "!unzip -qq /content/assets%2F-M_MXHpa1Cq7qojD2u_r%2F-MiagkMwYEcI7NIOZgRQ%2F-MiaqvVXGkxDkTTxGR1K%2Fanimals_od.zip?alt=media" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bLh4uuIMuNwt" - }, - "source": [ - "The dataset has the following folder structure:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JHyrqNgNuRO2" - }, - "source": [ - "animals_od\n", - "- images\n", - " - image_1.jpg\n", - " - image_2.jpg\n", - " - image_3.jpg\n", - " - image_4.jpg\n", - "- boxes\n", - " - image_1.txt\n", - " - image_2.txt\n", - " - image_3.txt\n", - " - image_4.txt\n", - " - classes.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g_iOi_9NuXAI" - }, - "source": [ - "Now that you have the data, let's **create a Hub Dataset** in the `./animals_od_hub` folder by running:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "qaZtpnpTOp-5" - }, - "source": [ - "import hub\n", - "from PIL import Image, ImageDraw\n", - "import numpy as np\n", - "import os\n", - "\n", - "ds = hub.empty('./animals_od_hub') # Create the dataset" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qNMOv3LPOyAd" - }, - "source": [ - "Next, let's specify the folder paths containing the images and annotations in the dataset. In YOLO format, images and annotations are typically matched using a common filename such as `image -> filename.jpeg` and `annotation -> filename.txt` . It's also helpful to create a list of all of the image files and the class names contained in the dataset." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZCjN0EKwO1Pu" - }, - "source": [ - "img_folder = './animals_od/images'\n", - "lbl_folder = './animals_od/boxes'\n", - "\n", - "# List of all images\n", - "fn_imgs = os.listdir(img_folder)\n", - "\n", - "# List of all class names\n", - "with open(os.path.join(lbl_folder, 'classes.txt'), 'r') as f:\n", - " class_names = f.read().splitlines()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g4CPD4nmO3_S" - }, - "source": [ - "Since annotations in YOLO are typically stored in text files, it's useful to write a helper function that parses the annotation file and returns numpy arrays with the bounding box coordinates and bounding box classes." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "bRIDfYXNO7kg" - }, - "source": [ - "def read_yolo_boxes(fn:str):\n", - " \"\"\"\n", - " Function reads a label.txt YOLO file and returns a numpy array of yolo_boxes \n", - " for the box geometry and yolo_labels for the corresponding box labels.\n", - " \"\"\"\n", - " \n", - " box_f = open(fn)\n", - " lines = box_f.read()\n", - " box_f.close()\n", - " \n", - " # Split each box into a separate line\n", - " lines_split = lines.splitlines()\n", - " \n", - " yolo_boxes = np.zeros((len(lines_split),4))\n", - " yolo_labels = np.zeros(len(lines_split))\n", - " \n", - " # Go through each line and parse data\n", - " for l, line in enumerate(lines_split):\n", - " line_split = line.split()\n", - " yolo_boxes[l,:]=np.array((float(line_split[1]), float(line_split[2]), float(line_split[3]), float(line_split[4])))\n", - " yolo_labels[l]=int(line_split[0]) \n", - " \n", - " return yolo_boxes, yolo_labels" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WKvPUjxcUPvO" - }, - "source": [ - "Finally, let's create the tensors and iterate through all the images in the dataset in order to populate the data in Hub. Boxes and their labels will be stored in separate tensors, and for a given sample, the first axis of the boxes array corresponds to the first-and-only axis of the labels array (i.e. if there are 3 boxes in an image, the labels array is 3x1 and the boxes array is 3x4)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "u2F4TXd0UEtH" - }, - "source": [ - "with ds:\n", - " ds.create_tensor('images', htype='image', sample_compression = 'jpeg')\n", - " ds.create_tensor('labels', htype='class_label', class_names = class_names)\n", - " ds.create_tensor('boxes', htype='bbox')\n", - "\n", - " for fn_img in fn_imgs:\n", - "\n", - " img_name = os.path.splitext(fn_img)[0]\n", - " fn_box = img_name+'.txt'\n", - "\n", - " # Get the arrays for the bounding boxes and their classes\n", - " yolo_boxes, yolo_labels = read_yolo_boxes(os.path.join(lbl_folder,fn_box))\n", - " \n", - " # Append data to tensors\n", - " ds.images.append(hub.read(os.path.join(img_folder, fn_img)))\n", - " ds.labels.append(yolo_labels.astype(np.uint32))\n", - " ds.boxes.append(yolo_boxes.astype(np.float32))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aYCI61o-O9CV" - }, - "source": [ - "## Inspect the Hub Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pXkD-gLgO_7L" - }, - "source": [ - "Let's check out the third sample from this dataset, which contains two bounding boxes." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PEPTKmCiPD-T" - }, - "source": [ - "# Draw bounding boxes for the third image\n", - "\n", - "ind = 2\n", - "img = Image.fromarray(ds.images[ind].numpy())\n", - "draw = ImageDraw.Draw(img)\n", - "(w,h) = img.size\n", - "boxes = ds.boxes[ind].numpy()\n", - "\n", - "for b in range(boxes.shape[0]):\n", - " (xc,yc) = (int(boxes[b][0]*w), int(boxes[b][1]*h))\n", - " (x1,y1) = (int(xc-boxes[b][2]*w/2), int(yc-boxes[b][3]*h/2))\n", - " (x2,y2) = (int(xc+boxes[b][2]*w/2), int(yc+boxes[b][3]*h/2))\n", - " draw.rectangle([x1,y1,x2,y2], width=2)\n", - " draw.text((x1,y1), ds.labels.info.class_names[ds.labels[ind].numpy()[b]])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "XZMcRLeQPHq6" - }, - "source": [ - "# Display the image and its bounding boxes\n", - "img" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Bg8rUpSWPJoK" - }, - "source": [ - "**Note:** For optimal object detection model performance, it is often important for datasets to contain images with no annotations (See the 4th sample in the dataset above). For that use case, in order to maintain equal length between the images, boxes, and labels tensors, users can upload empty numpy arrays as long as `len(sample.shape)` for an empty and non-empty sample is equal. Therefore, an empty bounding box can be added using `ds.boxes.append(np.zeros((0,4)))` because `len(sample.shape) == 2`, just like for a bounding box with data." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "79QnkE-UUySP" - }, - "source": [ - "Congrats! You just created a beautiful object detection dataset! πŸŽ‰" - ] - } - ] -} diff --git a/colabs/Data_Processing_Using_Parallel_Computing.ipynb b/colabs/Data_Processing_Using_Parallel_Computing.ipynb deleted file mode 100644 index e561179..0000000 --- a/colabs/Data_Processing_Using_Parallel_Computing.ipynb +++ /dev/null @@ -1,327 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Data Processing Using Parallel Computing", - "provenance": [], - "collapsed_sections": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lKU8kmSs65xv" - }, - "source": [ - "# ***Data Processing Using Parallel Computing***" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3zK9b4yiMRzB" - }, - "source": [ - "#### [Step 7](https://docs.activeloop.ai/getting-started/parallel-computing) in the [Getting Started Guide](https://docs.activeloop.ai/getting-started) highlights how `hub.compute` can be used to rapidly upload datasets. This tutorial expands further and highlights the power of parallel computing for dataset processing." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3UseHLcoRIYz" - }, - "source": [ - "## Install Hub" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "l5mOffq5RN-T" - }, - "source": [ - "from IPython.display import clear_output\n", - "!pip3 install hub\n", - "clear_output()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "SOkA83IsRWYo" - }, - "source": [ - "# IMPORTANT - Please restart your Colab runtime after installing Hub!\n", - "# This is a Colab-specific issue that prevents PIL from working properly.\n", - "import os\n", - "os.kill(os.getpid(), 9)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7wGo53ndMTCB" - }, - "source": [ - "## Dataset Transformations" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "52h9xKujOJFs" - }, - "source": [ - "Computer vision applications often require users to process and transform their data as part of their workflows. For example, you may perform perspective transforms, resize images, adjust their coloring, or many others. In this example, a flipped version of the MNIST dataset is created, which may be useful for training a model that identifies text from reflections in a mirror.\n", - "\n", - "The first step to creating a flipped version of the MNIST dataset is to define a function that will flip the dataset images." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "qaZtpnpTOp-5" - }, - "source": [ - "import hub\n", - "from PIL import Image\n", - "import numpy as np\n", - "\n", - "@hub.compute\n", - "def flip_horizontal(sample_in, sample_out):\n", - " ## First two arguments are always default arguments containing:\n", - " # 1st argument is an element of the input iterable (list, dataset, array,...)\n", - " # 2nd argument is a dataset sample\n", - " \n", - " # Append the label and image to the output sample\n", - " sample_out.labels.append(sample_in.labels.numpy())\n", - " sample_out.images.append(np.flip(sample_in.images.numpy(), axis = 1))\n", - " \n", - " return sample_out" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qNMOv3LPOyAd" - }, - "source": [ - "Next, the existing MNIST dataset is loaded, and `hub.like` is used to create an empty dataset with the same tensor structure." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZCjN0EKwO1Pu" - }, - "source": [ - "ds_mnist = hub.load('hub://activeloop/mnist-train')\n", - "\n", - "#We use the overwrite=True to make this code re-runnable\n", - "ds_mnist_flipped = hub.like('./mnist_flipped', ds_mnist, overwrite = True)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g4CPD4nmO3_S" - }, - "source": [ - "Finally, the flipping operation is evaluated for the 1st 100 elements in the input dataset `ds_mnist`, and the result is automatically stored in `ds_mnist_flipped`." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "bRIDfYXNO7kg" - }, - "source": [ - "flip_horizontal().eval(ds_mnist[0:100], ds_mnist_flipped, num_workers = 2)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WKvPUjxcUPvO" - }, - "source": [ - "Let's check out the flipped images:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "u2F4TXd0UEtH" - }, - "source": [ - "Image.fromarray(ds_mnist.images[0].numpy())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "C3xsuWQTUEdm" - }, - "source": [ - "Image.fromarray(ds_mnist_flipped.images[0].numpy())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aYCI61o-O9CV" - }, - "source": [ - "##Dataset Processing Pipelines" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pXkD-gLgO_7L" - }, - "source": [ - "In order to modularize your dataset processing, it is often helpful to create functions for specific data processing tasks, and combine them in pipelines in order to transform your data end-to-end. In this example, you can create a pipeline using the `flip_horizontal` function above and the `resize` function below." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PEPTKmCiPD-T" - }, - "source": [ - "@hub.compute\n", - "def resize(sample_in, sample_out, new_size):\n", - " ## First two arguments are always default arguments containing:\n", - " # 1st argument is an element of the input iterable (list, dataset, array,...)\n", - " # 2nd argument is a dataset sample\n", - " ## Third argument is the required size for the output images\n", - " \n", - " # Append the label and image to the output sample\n", - " sample_out.labels.append(sample_in.labels.numpy())\n", - " sample_out.images.append(np.array(Image.fromarray(sample_in.images.numpy()).resize(new_size)))\n", - " \n", - " return sample_out" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MDtcpMmuPFdg" - }, - "source": [ - "Functions decorated using `hub.compute` can be easily combined into pipelines using hub.compose. Required arguments for the functions must be passed into the pipeline in this step:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "XZMcRLeQPHq6" - }, - "source": [ - "pipeline = hub.compose([flip_horizontal(), resize(new_size = (64,64))])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Bg8rUpSWPJoK" - }, - "source": [ - "Just like for the single-function example above, the input and output datasets are created first, and the pipeline is evaluated for the 1st 100 elements in the input dataset `ds_mnist_flipped`. The result is automatically stored in `ds_mnist_pipe`." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "nnbEyjv2PL0a" - }, - "source": [ - "#We use the overwrite=True to make this code re-runnable\n", - "ds_mnist_pipe = hub.like('./mnist_pipeline', ds_mnist, overwrite = True)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "A4fkclYcPNjM" - }, - "source": [ - "pipeline.eval(ds_mnist[0:100], ds_mnist_pipe, num_workers = 2)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "79QnkE-UUySP" - }, - "source": [ - "Let's check out the processed images:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "7CrmZbCtUzMV" - }, - "source": [ - "Image.fromarray(ds_mnist.images[0].numpy())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "j8itwG67U2g_" - }, - "source": [ - "Image.fromarray(ds_mnist_pipe.images[0].numpy())" - ], - "execution_count": null, - "outputs": [] - } - ] -} diff --git a/colabs/Getting_Started_with_Hub.ipynb b/colabs/Getting_Started_with_Hub.ipynb deleted file mode 100644 index e69a2e1..0000000 --- a/colabs/Getting_Started_with_Hub.ipynb +++ /dev/null @@ -1,1601 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": " Getting Started with Hub 2.0", - "provenance": [], - "collapsed_sections": [ - "lKU8kmSs65xv", - "ZrjGQON37lk2", - "0N-f2SYU7OjQ", - "bR5n8yYg-0Wu", - "G-DM6PKq_di2", - "46H4nEnZDv5m", - "JGo-E8Z8Ho6F", - "A8Mye_Z5Htut", - "1Kb9q_ZqIARN", - "bjmnRLWHINXG", - "NQipSo2OF_lB", - "LVma__gxGq97", - "Bnr9ItdkGzDk", - "x5bX92ZUG_2F", - "guao84xTb4Zg", - "iXRCphquSFs3", - "_CEF-kjySdLp", - "kWvgUH25Tj8V", - "2JRpqeYqV-oT", - "HCrKgp6FYDG9", - "i1GqH1JvYkNP", - "EnTPLIS5i7yE", - "uinXs4r1i7Zz" - ], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lKU8kmSs65xv" - }, - "source": [ - "# **Step 1**: _Hello World_" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZrjGQON37lk2" - }, - "source": [ - "## Installing Hub" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9pcfYcPu7KxY" - }, - "source": [ - "Hub can be installed via `pip`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oC_N5qOx6o0d" - }, - "source": [ - "from IPython.display import clear_output\n", - "!pip3 install hub\n", - "clear_output()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "pWJeNh83XfrD" - }, - "source": [ - "# IMPORTANT - Please restart your Colab runtime after installing Hub!\n", - "# This is a Colab-specific issue that prevents some imports from working properly.\n", - "import os\n", - "os.kill(os.getpid(), 9)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0N-f2SYU7OjQ" - }, - "source": [ - "## Fetching your first Hub dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9aNFn7rZ7qxP" - }, - "source": [ - "Begin by loading in [MNIST](https://en.wikipedia.org/wiki/MNIST_database), the hello world dataset of machine learning. \n", - "\n", - "First, load the `Dataset` by pointing to its storage location. 
Datasets hosted on the Activeloop Platform are typically identified by the namespace of the organization followed by the dataset name: `activeloop/mnist-train`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "izccjS4k7NvX" - }, - "source": [ - "import hub\n", - "\n", - "dataset_path = 'hub://activeloop/mnist-train'\n", - "ds = hub.load(dataset_path) # Returns a Hub Dataset but does not download data locally" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bR5n8yYg-0Wu" - }, - "source": [ - "## Reading Samples From a Hub Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0XdaAKaS-3NO" - }, - "source": [ - "Data is not immediately read into memory because Hub operates [lazily](https://en.wikipedia.org/wiki/Lazy_evaluation). You can fetch data by calling the `.numpy()` method, which reads data into a NumPy array.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6qpQeNoq-xfo" - }, - "source": [ - "# Indexing\n", - "W = ds.images[0].numpy() # Fetch image return a NumPy array\n", - "X = ds.labels[0].numpy(aslist=True) # Fetch label and store as list of NumPy array\n", - "\n", - "# Slicing\n", - "Y = ds.images[0:100].numpy() # Fetch 100 images and return a NumPy array if possible\n", - " # This method produces an exception if\n", - " # the shape of the images is not equal\n", - "Z = ds.labels[0:100].numpy(aslist=True) # Fetch 100 labels and store as list of \n", - " # NumPy arrays" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "eNGHXfdKwJ7W" - }, - "source": [ - "print('X is {}'.format(X))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tmi2w0_e_LtH" - }, - "source": [ - "Congratulations, you've got Hub working on your local machine! πŸ€“" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "G-DM6PKq_di2" - }, - "source": [ - "# **Step 2**: _Creating Hub Datasets_\n", - "*Creating and storing Hub Datasets manually.*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FEzK8LTe_gJW" - }, - "source": [ - "Creating Hub datasets is simple, you have full control over connecting your source data (files, images, etc.) to specific tensors in the Hub Dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EGXGvKU1qsp1" - }, - "source": [ - "## Manual Creation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CQk29Mnhqn1V" - }, - "source": [ - "Let's follow along with the example below to create our first dataset. First, download and unzip the small classification dataset below called the *animals dataset*." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "QDJRrlDP_DsW" - }, - "source": [ - "# Download dataset\n", - "from IPython.display import clear_output\n", - "!wget https://firebasestorage.googleapis.com/v0/b/gitbook-28427.appspot.com/o/assets%2F-M_MXHpa1Cq7qojD2u_r%2F-MbI7YlHiBJg6Fg-HsOf%2F-MbIUlXZn7EYdgDNncOI%2Fanimals.zip?alt=media&token=c491c2cb-7f8b-4b23-9617-a843d38ac611\n", - "clear_output()" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "SIQf9cY6_vyn" - }, - "source": [ - "# Unzip to './animals' folder\n", - "!unzip -qq /content/assets%2F-M_MXHpa1Cq7qojD2u_r%2F-MbI7YlHiBJg6Fg-HsOf%2F-MbIUlXZn7EYdgDNncOI%2Fanimals.zip?alt=media" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IIz-MYImAfCg" - }, - "source": [ - "The dataset has the following folder structure:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IuhZZqVIAqj_" - }, - "source": [ - "animals\n", - "- cats\n", - " - image_1.jpg\n", - " - image_2.jpg\n", - "- dogs\n", - " - image_3.jpg\n", - " - image_4.jpg" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6Lez5uCJAto4" - }, - "source": [ - "Now that you have the data, you can **create a Hub `Dataset`** and initialize its tensors. Running the following code will create a Hub dataset inside of the `./animals_hub` folder.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "qtzmT0iBNV23" - }, - "source": [ - "import hub\n", - "from PIL import Image\n", - "import numpy as np\n", - "import os\n", - "\n", - "ds = hub.empty('./animals_hub') # Creates the dataset" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PQ5yt0aaNeP5" - }, - "source": [ - "Next, let's inspect the folder structure for the source dataset `'./animals'` to find the class names and the files that need to be uploaded to the Hub dataset." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ubGLkgG8Njbb" - }, - "source": [ - "# Find the class_names and list of files that need to be uploaded\n", - "dataset_folder = './animals'\n", - "\n", - "class_names = os.listdir(dataset_folder)\n", - "\n", - "files_list = []\n", - "for dirpath, dirnames, filenames in os.walk(dataset_folder):\n", - " for filename in filenames:\n", - " files_list.append(os.path.join(dirpath, filename))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CtVSh0FnNmyI" - }, - "source": [ - "Next, let's **create the dataset tensors and upload metadata**. 
Check out our page on [Storage Synchronization](https://docs.activeloop.ai/how-hub-works/storage-synchronization) for details about the `with` syntax below.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "a6QDC6caNpiH" - }, - "source": [ - "with ds:\n", - " # Create the tensors with names of your choice.\n", - " ds.create_tensor('images', htype = 'image', sample_compression = 'jpeg')\n", - " ds.create_tensor('labels', htype = 'class_label', class_names = class_names)\n", - "\n", - " # Add arbitrary metadata - Optional\n", - " ds.info.update(description = 'My first Hub dataset')\n", - " ds.images.info.update(camera_type = 'SLR')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TD-hCSBKBA_m" - }, - "source": [ - "**Note:** Specifying `htype` and `dtype` is not required, but it is highly recommended in order to optimize performance, especially for large datasets. Use `dtype` to specify the numeric type of tensor data, and use `htype` to specify the underlying data structure. More information on `htype` can be found [here](https://api-docs.activeloop.ai/htypes.html)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HR4kLo6YBOhO" - }, - "source": [ - "Finally, let's **populate the data** in the tensors. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "0QRAyS-HA-Fp" - }, - "source": [ - "with ds:\n", - " # Iterate through the files and append to hub dataset\n", - " for file in files_list:\n", - " label_text = os.path.basename(os.path.dirname(file))\n", - " label_num = class_names.index(label_text)\n", - " \n", - " ds.images.append(hub.read(file)) # Append to images tensor using hub.read\n", - " ds.labels.append(np.uint32(label_num)) # Append to labels tensor" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lWqYzfI1DCPG" - }, - "source": [ - "**Note:** `ds.images.append(hub.read(path))` is functionally equivalent to `ds.image.append(PIL.Image.fromarray(path))`. However, the `hub.read()` method is significantly faster because it does not decompress and recompress the image if the compression matches the `sample_compression` for that tensor. Further details are available in the next section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WzHVb521XSud" - }, - "source": [ - "Check out the first image from this dataset. More details about Accessing Data are available in **Step 5**." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "OMG2oif0XSDZ" - }, - "source": [ - "Image.fromarray(ds.images[0].numpy())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "g8E_f-eXqy1c" - }, - "source": [ - "## Automatic Creation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MCjy5dH9q3Gi" - }, - "source": [ - "The above animals dataset can also be converted to Hub format automatically using 1 line of code:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "CUtOL7F8q1xB" - }, - "source": [ - "src = \"./animals\"\n", - "dest = './animals_hub_auto'\n", - "\n", - "ds = hub.ingest(src, dest)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "o6xboPUKrs1l" - }, - "source": [ - "Image.fromarray(ds.images[0].numpy())" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "03b3r7owq7o8" - }, - "source": [ - "**Note**: Automatic creation currently only supports image classification datasets, though support for other dataset types is continually being added. A full list of supported datasets is available [here](https://api-docs.activeloop.ai/#hub.ingest)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PK_wpkYsDdH2" - }, - "source": [ - "## Creating Tensor Hierarchies" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1btlOtBDDe4G" - }, - "source": [ - "Often it's important to create tensors hierarchically, because information between tensors may be inherently coupledβ€”such as bounding boxes and their corresponding labels. Hierarchy can be created using tensor `groups`:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ICg3Z1z8CRGN" - }, - "source": [ - "ds = hub.empty('./groups_test') # Creates the dataset\n", - "\n", - "# Create tensor hierarchies\n", - "ds.create_group('my_group')\n", - "ds.my_group.create_tensor('my_tensor')\n", - "\n", - "# Alternatively, a group can us created using create_tensor with '/'\n", - "ds.create_tensor('my_group_2/my_tensor') #Automatically creates the group 'my_group_2'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wE-rWBCkpI9T" - }, - "source": [ - "Tensors in groups are accessed via:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "78s3Oa_jpKXV" - }, - "source": [ - "ds.my_group.my_tensor" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3fhjWZ9hDvKe" - }, - "source": [ - "For more detailed information regarding accessing datasets and their tensors, check out the next section." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "46H4nEnZDv5m" - }, - "source": [ - "# **Step 3**: _Understanding Compression_\n", - "\n", - "*Using compression to achieve optimal performance.*" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_ajldDggEp8O" - }, - "source": [ - "All sample data in Hub can be stored in a raw uncompressed format. 
However, in order to achieve optimal performance in terms of speed and memory, it is critical to specify an appropriate compression method for your data.\n", - "\n", - "For example, when creating a tensor for storing images, you can choose the compression technique for the image samples using the `sample_compression` input:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6y68OihNPPTv" - }, - "source": [ - "import hub\n", - "\n", - "ds = hub.empty('./compression_test')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "uOw9hc0jDpQY" - }, - "source": [ - "ds.create_tensor(\"images_example\", htype = \"image\", sample_compression = \"jpeg\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nv4ktXoCE2K2" - }, - "source": [ - "In this example, every image added in subsequent `.append(...)` calls is compressed using the specified `sample_compression` method. If the source data is already in the correct compression format, it is saved as-is. Otherwise, it is recompressed to the specified format, as described in detail below. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8WaFBxrEE9GI" - }, - "source": [ - "#### **When choosing the optimal compression, the primary tradeoffs are lossiness, memory, and runtime:**" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yM8VtZ98FCUu" - }, - "source": [ - "**Lossiness** - Certain compression techniques are lossy, meaning that there is irreversible information loss when saving the data in the compressed format. \n", - "\n", - "**Memory** - Different compression techniques have substantially different memory footprints. For instance, `png` vs `jpeg` compression may result in a 10X difference in the size of a Hub dataset. \n", - "\n", - "**Runtime** - The highest uploads speeds can be achieved when the `sample_compression` value matches the compression of the source data, such as:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "AylKYTVCE0eI" - }, - "source": [ - "# sample_compression and my_image are \"jpeg\"\n", - "ds.create_tensor(\"images_jpeg\", htype = \"image\", sample_compression = \"jpeg\")\n", - "ds.images_jpeg.append(hub.read(\"/content/animals/dogs/image_3.jpg\"))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hotuAwslFbAu" - }, - "source": [ - "However, a mismatch between compression of the source data and `sample_compression` in Hub results in significantly slower upload speeds, because Hub must decompress the source data and recompress it using the specified `sample_compression` before saving:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "qkJKv00UFexo" - }, - "source": [ - "# sample_compression is \"png\" and my_image is \"jpeg\"\n", - "ds.create_tensor(\"images_png\", htype = \"image\", sample_compression = \"png\")\n", - "ds.images_png.append(hub.read(\"/content/animals/dogs/image_3.jpg\"))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3LMsd3K9GJJ9" - }, - "source": [ - "**Note:** Therefore, due to the computational costs associated with decompressing and recompressing data, it is important that you consider the runtime implications of uploading source data that is compressed differently than the specified `sample_compression`. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JGo-E8Z8Ho6F" - }, - "source": [ - "# **Step 4**: _Accessing Data_\n", - "_Accessing and loading Hub Datasets._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "A8Mye_Z5Htut" - }, - "source": [ - "## Loading Datasets" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0DI_D7flHvEN" - }, - "source": [ - "Hub Datasets can be loaded and created in a variety of storage locations with minimal configuration. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "I9dl3mfENulO" - }, - "source": [ - "import hub" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "sltdan65HmRN" - }, - "source": [ - "# Local Filepath\n", - "ds = hub.load('./animals_hub') # Dataset created in Step 2 in this Colab Notebook" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "41FBvx25NWMN" - }, - "source": [ - "# S3\n", - "# ds = hub.load('s3://my_dataset_bucket', creds={...})" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "PuacdMOgNNmT" - }, - "source": [ - "# Public Dataset hosted by Activeloop\n", - "ds = hub.load('hub://activeloop/k49-train')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "ocs18sNqNQfG" - }, - "source": [ - "# Dataset in another workspace on Activeloop Platform\n", - "# ds = hub.load('hub://workspace_name/dataset_name')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZD60qFaAH2qg" - }, - "source": [ - "**Note:** Since `ds = hub.dataset(path)` can be used to both create and load datasets, you may accidentally create a new dataset if there is a typo in the path you provided while intending to load a dataset. If that occurs, simply use `ds.delete()` to remove the unintended dataset permanently." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1Kb9q_ZqIARN" - }, - "source": [ - "## Referencing Tensors" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bq5WSI5LIClV" - }, - "source": [ - "Hub allows you to reference specific tensors using keys or via the `.` notation outlined below. \n", - "\n", - "\n", - "**Note:** data is still not loaded by these commands." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "jr_ZEtBnN1Wp" - }, - "source": [ - "ds = hub.dataset('hub://activeloop/k49-train')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "24trRqlLH0Tl" - }, - "source": [ - "### NO HIERARCHY ###\n", - "ds.images # is equivalent to\n", - "ds['images']\n", - "\n", - "ds.labels # is equivalent to\n", - "ds['labels']\n", - "\n", - "### WITH HIERARCHY ###\n", - "# ds.localization.boxes # is equivalent to\n", - "# ds['localization/boxes']\n", - "\n", - "# ds.localization.labels # is equivalent to\n", - "# ds['localization/labels']" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bjmnRLWHINXG" - }, - "source": [ - "## Accessing Data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "js3jsmBHIPqu" - }, - "source": [ - "Data within the tensors is loaded and accessed using the `.numpy()` command:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6QUWjQNGILWQ" - }, - "source": [ - "# Indexing\n", - "ds = hub.dataset('hub://activeloop/k49-train')\n", - "\n", - "W = ds.images[0].numpy() # Fetch an image and return a NumPy array\n", - "X = ds.labels[0].numpy(aslist=True) # Fetch a label and store it as a \n", - " # list of NumPy arrays\n", - "\n", - "# Slicing\n", - "Y = ds.images[0:100].numpy() # Fetch 100 images and return a NumPy array\n", - " # The method above produces an exception if \n", - " # the images are not all the same size\n", - "\n", - "Z = ds.labels[0:100].numpy(aslist=True) # Fetch 100 labels and store \n", - " # them as a list of NumPy arrays" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DykgrsBEIfk1" - }, - "source": [ - "**Note:** The `.numpy()` method will produce an exception if all samples in the requested tensor do not have a uniform shape. If that's the case, running `.numpy(aslist=True)` solves the problem by returning a list of NumPy arrays, where the indices of the list correspond to different samples. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NQipSo2OF_lB" - }, - "source": [ - "# **Step 5**: _Using Activeloop Storage_\n", - "\n", - "_Storing and loading datasets from Activeloop Platform Storage._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bA39G647GHX4" - }, - "source": [ - "You can store your Hub Datasets on Activeloop Platform by first creating an account in the CLI using:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "PCDC-5dmGFdJ" - }, - "source": [ - "!activeloop register" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "o1iZpxtOGJ0N" - }, - "source": [ - "In order for the Python API to authenticate with the Activeloop Platform, you should log in from the CLI using:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Z0OUCCMGGLv0" - }, - "source": [ - "!activeloop login -u username -p password\n", - "\n", - "# Alternatively use \"activeloop login\" ... which is followed by prompts for username and password" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FvBxhaAYGNOi" - }, - "source": [ - "You can then access or create Hub Datasets by passing the Activeloop Platform path to `hub.dataset()`." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "FeL0a2zwGXeU" - }, - "source": [ - "import hub\n", - "\n", - "# platform_path = 'hub://workspace_name/dataset_name'\n", - "# 'hub://jane_smith/my_awesome_dataset'\n", - " \n", - "ds = hub.dataset(platform_path)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "huQQ1M8kGcyL" - }, - "source": [ - "**Note**: \n", - "\n", - "When you create an account in Activeloop Platform, a default workspace is created that has the same name as your username. You are also able to create other workspaces that represent organizations, teams, or other collections of multiple users. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vUdVLQUGGnsA" - }, - "source": [ - "Public datasets such as `'hub://activeloop/mnist-train'` can be accessed without logging in." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LVma__gxGq97" - }, - "source": [ - "# **Step 6**: _Connecting Hub Datasets to ML Frameworks_\n", - "\n", - "_Connecting Hub Datasets to machine learning frameworks such as PyTorch and TensorFlow._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8r-AkeJMGwxB" - }, - "source": [ - "You can connect Hub Datasets to popular ML frameworks such as PyTorch and TensorFlow using minimal boilerplate code, and Hub takes care of the parallel processing!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Bnr9ItdkGzDk" - }, - "source": [ - "## PyTorch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wKkrCv2NG1GG" - }, - "source": [ - "You can train a model by creating a PyTorch DataLoader from a Hub Dataset using `ds.pytorch()`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "HP3C2uoAGnNK" - }, - "source": [ - "import hub\n", - "from torch.utils.data import DataLoader\n", - "\n", - "ds = hub.dataset('hub://activeloop/cifar100-train') # Hub Dataset\n", - "dataloader = ds.pytorch(batch_size = 16, num_workers = 2) #PyTorch DataLoader\n", - "\n", - "for data in dataloader:\n", - " print(data)\n", - " break\n", - " # Training Loop" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x5bX92ZUG_2F" - }, - "source": [ - "## TensorFlow" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jeRUG-arHP1F" - }, - "source": [ - "Similarly, you can convert a Hub Dataset to a TensorFlow Dataset via the `tf.Data` API. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "I1bma0HSHOAO" - }, - "source": [ - "ds # Hub Dataset object, to be used for training\n", - "ds_tf = ds.tensorflow() # A TensorFlow Dataset" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "guao84xTb4Zg" - }, - "source": [ - "# **Step 7**: _Parallel Computing_\n", - "\n", - "_Running computations and processing data in parallel._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BVcZ28epcKRc" - }, - "source": [ - "Hub enables you to easily run computations in parallel and significantly accelerate your data processing workflows. This example primarily focuses on parallel dataset uploading, and other use cases such as dataset transformations can be found in [this tutorial](https://docs.activeloop.ai/tutorials/data-processing-using-parallel-computing).\n", - "\n", - "Parallel compute using Hub has two core elements: #1. defining a function or pipeline that will run in parallel and #2. 
evaluating it using the appropriate inputs and outputs. Let's start with #1 by defining a function that processes files and appends their data to the `labels` and `images` tensors. "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "ZWNxzF1pcWxn"
- },
- "source": [
- "**Defining the parallel computing function**\n",
- "\n",
- "The first step for running parallel computations is to define a function that will run in parallel by decorating it using `@hub.compute`. In the example below, `file_to_hub` converts data from files into hub format, just like in **Step 2: Creating Hub Datasets Manually**. If you have not completed Step 2, please complete the section that downloads and unzips the *animals* dataset."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "JMjMF_-LcHtl"
- },
- "source": [
- "import hub\n",
- "from PIL import Image\n",
- "import numpy as np\n",
- "import os\n",
- "\n",
- "@hub.compute\n",
- "def file_to_hub(file_name, sample_out, class_names):\n",
- " ## First two arguments are always default arguments containing:\n",
- " # 1st argument is an element of the input iterable (list, dataset, array,...)\n",
- " # 2nd argument is a dataset sample\n",
- " # Other arguments are optional\n",
- " \n",
- " # Find the label number corresponding to the file\n",
- " label_text = os.path.basename(os.path.dirname(file_name))\n",
- " label_num = class_names.index(label_text)\n",
- " \n",
- " # Append the label and image to the output sample\n",
- " sample_out.labels.append(np.uint32(label_num))\n",
- " sample_out.images.append(hub.read(file_name))\n",
- " \n",
- " return sample_out"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "d-ZhXH-pcgT8"
- },
- "source": [
- "In all functions decorated using `@hub.compute`, the first argument must be a single element of any input iterable that is being processed in parallel. In this case, that is a filename `file_name`, because `file_to_hub` reads image files and populates data in the dataset's tensors. \n",
- "\n",
- "The second argument is a dataset sample `sample_out`, which can be operated on using similar syntax to dataset objects, such as `sample_out.append(...)`, `sample_out.extend(...)`, etc.\n",
- "\n",
- "The function decorated using `@hub.compute` must return `sample_out`, which represents the data that is added or modified by that function."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "TIUiNuQqchnH"
- },
- "source": [
- "**Executing the transform**\n",
- "\n",
- "To execute the transform, you must define the dataset that will be modified by the parallel computation."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "TZfEn1g_cno_"
- },
- "source": [
- "ds = hub.empty('./animals_hub_transform') # Creates the dataset"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "u7FIReeLcpka"
- },
- "source": [
- "Next, you define the input iterable that describes the information that will be operated on in parallel. In this case, that is a list of files `files_list` from the animals dataset in Step 2." 
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "8CwypbTxcrx0"
- },
- "source": [
- "# Find the class_names and list of files that need to be uploaded\n",
- "dataset_folder = './animals'\n",
- "\n",
- "class_names = os.listdir(dataset_folder)\n",
- "\n",
- "files_list = []\n",
- "for dirpath, dirnames, filenames in os.walk(dataset_folder):\n",
- " for filename in filenames:\n",
- " files_list.append(os.path.join(dirpath, filename))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "5IC-VRKVcuRI"
- },
- "source": [
- "You can now create the tensors for the dataset and **run the parallel computation** using the `.eval` syntax. Only the optional input arguments are passed to `file_to_hub`; the first two default arguments, `file_name` and `sample_out`, are skipped. \n",
- "\n",
- "The input iterable `files_list` and the output dataset `ds` are passed to the `.eval` method as the first and second arguments, respectively."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "p4H4Fug0cxJG"
- },
- "source": [
- "with ds:\n",
- " ds.create_tensor('images', htype = 'image', sample_compression = 'jpeg')\n",
- " ds.create_tensor('labels', htype = 'class_label', class_names = class_names)\n",
- " \n",
- " file_to_hub(class_names=class_names).eval(files_list, ds, num_workers = 2)"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "BfWc3_fkhr0W"
- },
- "source": [
- "Image.fromarray(ds.images[0].numpy())"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "5xTj7kt0jrd3"
- },
- "source": [
- "Congrats! You just created a dataset using parallel computing! 🎈"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "iXRCphquSFs3"
- },
- "source": [
- "# **Step 8**: _Version Control_\n",
- "\n",
- "_Managing changes to datasets with Git-like commands._"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "4y_V53L8SCuB"
- },
- "source": [
- "Hub version control allows users to manage changes to datasets with commands very similar to Git. It provides critical insights into how data is evolving, and it works with datasets of any size!\n",
- "\n",
- "\n",
- "Let's create a hub dataset and check out how version control works!" 
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "YgEWowxySUDL"
- },
- "source": [
- "import hub\n",
- "import numpy as np\n",
- "\n",
- "# Set overwrite = True for re-runnability\n",
- "ds = hub.dataset('./version_control', overwrite = True)\n",
- "\n",
- "# Create a tensor and append 200 100x100x3 arrays\n",
- "with ds:\n",
- " ds.create_tensor('images', htype = 'image', sample_compression = 'jpeg')\n",
- " ds.images.extend(np.ones((200, 100, 100, 3), dtype = 'uint8'))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "_CEF-kjySdLp"
- },
- "source": [
- "## Commit"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "joKq3VV0SdEW"
- },
- "source": [
- "To commit the data added above, simply run `ds.commit`:\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "pj9uTZeSTGwT"
- },
- "source": [
- "first_commit_id = ds.commit('Added 200 100x100x3 arrays')\n",
- "\n",
- "print('Dataset in commit {} has {} samples'.format(first_commit_id, len(ds)))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Tc2-MRmaSc4x"
- },
- "source": [
- "The printout shows that the first commit has 200 samples. Next, let's add 50 more samples and commit the update:"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "zArtG0phTZRv"
- },
- "source": [
- "with ds:\n",
- " ds.images.extend(np.ones((50, 150, 150, 3), dtype = 'uint8'))\n",
- " \n",
- "second_commit_id = ds.commit('Added 50 150x150x3 arrays')\n",
- "print('Dataset in commit {} has {} samples'.format(second_commit_id, len(ds)))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "fYjnY_1RTcjM"
- },
- "source": [
- "The printout now shows that the second commit has 250 samples. \n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "kWvgUH25Tj8V"
- },
- "source": [
- "## Log"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "CiqOb8POTkb4"
- },
- "source": [
- "The commit history starting from the current commit can be shown using `ds.log`:\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "XQSxvzIcTuU-"
- },
- "source": [
- "log = ds.log()"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "TgefyAuATwi4"
- },
- "source": [
- "This command prints the log to the console and also assigns it to the specified variable `log`. The author of the commit is the username of the [Activeloop account](https://docs.activeloop.ai/getting-started/using-activeloop-storage) that is logged in on the machine."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "2JRpqeYqV-oT"
- },
- "source": [
- "## Branch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "4TWcOT4RV-d4"
- },
- "source": [
- "Branching takes place by running the `ds.checkout` command with the parameter `create = True`. Let's create a new branch, add a `labels` tensor, populate it with data, create a new commit on that branch, and display the log." 
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "eY-CZmzrXr0X"
- },
- "source": [
- "ds.checkout('new_branch', create = True)\n",
- "\n",
- "with ds:\n",
- " ds.create_tensor('labels', htype = 'class_label')\n",
- " ds.labels.extend(np.zeros((250,1), dtype = 'uint32'))\n",
- " \n",
- "new_branch_commit_id = ds.commit('Added labels tensor and 250 labels')\n",
- "print('Dataset in commit {} has tensors: {}'.format(new_branch_commit_id, ds.tensors))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "VUUMXFKEXuIq"
- },
- "source": [
- "The printout shows that the dataset on the `new_branch` branch contains `images` and `labels` tensors.\n",
- "\n",
- "\n",
- "The log now shows a commit on `new_branch` as well as the previous commits on `main`:"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "O3-UgHZPX_0u"
- },
- "source": [
- "ds.log()"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "HCrKgp6FYDG9"
- },
- "source": [
- "## Checkout"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "07nHcIIiYFtW"
- },
- "source": [
- "A previous commit or branch can be checked out using `ds.checkout`:"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "YZe8iXjlYEdf"
- },
- "source": [
- "ds.checkout('main')\n",
- "\n",
- "print('Dataset in branch {} has tensors: {}'.format('main', ds.tensors))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "7AZXuEVYYVHm"
- },
- "source": [
- "As expected, the printout shows that the dataset on `main` only contains the `images` tensor, since the `labels` tensor was added on `new_branch`."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "i1GqH1JvYkNP"
- },
- "source": [
- "## HEAD Commit\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "RbiRZ0eGiBrz"
- },
- "source": [
- "Unlike Git, Hub's version control does not have a staging area because changes to datasets are not stored locally before they are committed. All changes are automatically reflected in the dataset's permanent storage (local or cloud). **Therefore, any changes to a dataset are automatically stored in a HEAD commit on the current branch**. This means that the uncommitted changes do not appear on other branches. Let's see how this works:\n",
- "\n",
- "You should currently be on the `main` branch, which has 250 samples. Let's add 75 more samples:\n"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "FwuzyJUViZC6"
- },
- "source": [
- "print('Dataset on {} branch has {} samples'.format('main', len(ds)))\n",
- "\n",
- "with ds:\n",
- " ds.images.extend(np.zeros((75, 100, 100, 3), dtype = 'uint8'))\n",
- " \n",
- "print('After updating, the HEAD commit on {} branch has {} samples'.format('main', len(ds)))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "4brOnBdyiq6p"
- },
- "source": [
- "Next, if you check out the first commit, the dataset contains 200 samples, which is the sample count from when the first commit was made. Therefore, the 75 uncommitted samples that were added to the `main` branch above are not reflected when other branches or commits are checked out." 
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "cvG-X9VqipM3"
- },
- "source": [
- "ds.checkout(first_commit_id)\n",
- "\n",
- "print('Dataset in commit {} has {} samples'.format(first_commit_id, len(ds)))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "7aoAeA7vixsC"
- },
- "source": [
- "Finally, when checking out the `main` branch again, the prior uncommitted changes are visible, and they are stored in the `HEAD` commit on `main`:"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "6DnXiwTmi6G9"
- },
- "source": [
- "ds.checkout('main')\n",
- "\n",
- "print('Dataset in {} branch has {} samples'.format('main', len(ds)))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "EnTPLIS5i7yE"
- },
- "source": [
- "## Diff - Coming Soon"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "JB79lLLJi7p4"
- },
- "source": [
- "Understanding changes between commits is critical for managing the evolution of datasets. The `diff` function will enable users to determine the number of samples that were added, removed, or updated for each tensor. Activeloop is currently working on an implementation."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "uinXs4r1i7Zz"
- },
- "source": [
- "## Merge - Coming Soon\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "kQOGilvkjG2c"
- },
- "source": [
- "Merging is a critical feature for collaborating on datasets, and Activeloop is currently working on an implementation."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Fz15ukH5jiIm"
- },
- "source": [
- "Congrats! You are now an expert in dataset version control! πŸŽ“"
- ]
- }
- ]
-}
diff --git a/colabs/Training_an_Image_Classification_Model_in_PyTorch.ipynb b/colabs/Training_an_Image_Classification_Model_in_PyTorch.ipynb
deleted file mode 100644
index e39e7fb..0000000
--- a/colabs/Training_an_Image_Classification_Model_in_PyTorch.ipynb
+++ /dev/null
@@ -1,390 +0,0 @@
-{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "name": "Training an Image Classification Model in PyTorch",
- "provenance": [],
- "collapsed_sections": [],
- "include_colab_link": true
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- }
- },
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "view-in-github",
- "colab_type": "text"
- },
- "source": [
- "\"Open"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "lKU8kmSs65xv"
- },
- "source": [
- "# ***Training an Image Classification Model in PyTorch***\n",
- "\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "3zK9b4yiMRzB"
- },
- "source": [
- "#### The primary objective for Hub is to enable users to manage their data more easily so they can train better ML models. This tutorial shows you how to train a simple image classification model while streaming data from a Hub dataset stored in the cloud." 
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "3UseHLcoRIYz"
- },
- "source": [
- "## Install Hub"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "l5mOffq5RN-T"
- },
- "source": [
- "from IPython.display import clear_output\n",
- "!pip3 install hub\n",
- "clear_output()"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "SOkA83IsRWYo"
- },
- "source": [
- "# IMPORTANT - Please restart your Colab runtime after installing Hub!\n",
- "# This is a Colab-specific issue that prevents PIL from working properly.\n",
- "import os\n",
- "os.kill(os.getpid(), 9)"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "7wGo53ndMTCB"
- },
- "source": [
- "## Data Preprocessing\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "52h9xKujOJFs"
- },
- "source": [
- "The first step is to select a dataset for training. This tutorial uses the [Fashion MNIST](https://github.com/zalandoresearch/fashion-mnist) dataset that has already been converted into hub format. It is a simple image classification dataset that categorizes images by clothing type (trouser, shirt, etc.)."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "neD2jhKDQ5WD"
- },
- "source": [
- "import hub\n",
- "from PIL import Image\n",
- "import numpy as np\n",
- "import os, time\n",
- "import torch\n",
- "from torchvision import datasets, transforms, models\n",
- "\n",
- "ds_train = hub.load('hub://activeloop/fashion-mnist-train')\n",
- "ds_test = hub.load('hub://activeloop/fashion-mnist-test')"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "m0jtotSNzeJ0"
- },
- "source": [
- "Image.fromarray(ds_train.images[0].numpy()).resize((100,100))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "jPSz9kml03Aa"
- },
- "source": [
- "print(ds_train.labels.info.class_names[str(ds_train.labels[0].numpy()[0])])"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Np5fIbViHlCu"
- },
- "source": [
- "The next step is to define a transformation function that will process the data and convert it into a format that can be passed into a deep learning model. The transformation function takes a single sample from a Hub dataset (a dictionary) as its input, and returns a dictionary containing the data that the training loop uses to train the model. In this particular example, `torchvision.transforms` is used as a part of the transformation pipeline that performs operations such as normalization and image augmentation (rotation)."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "WqdWgumwQ1d6"
- },
- "source": [
- "def transform(sample_in):\n",
- " return {'images': tform(sample_in['images']), 'labels': sample_in['labels']}\n",
- "\n",
- "tform = transforms.Compose([\n",
- " transforms.ToPILImage(), # Must convert to PIL image for subsequent operations to run\n",
- " transforms.RandomRotation(20), # Image augmentation\n",
- " transforms.ToTensor(), # Must convert to PyTorch tensor for subsequent operations to run\n",
- " transforms.Normalize([0.5], [0.5]),\n",
- "])"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "ToNQ3WwfIJZf"
- },
- "source": [
- "**Note:** Don't worry if the above syntax is a bit confusing 😡! 
We're currently improving it."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "DGmWr44PIQMk"
- },
- "source": [
- "You are now ready to create a PyTorch DataLoader that connects the Hub dataset to the PyTorch model. This can be done using the provided method `ds.pytorch()`, which automatically applies the user-defined transformation function, takes care of random shuffling (if desired), and converts hub data to PyTorch tensors. The `num_workers` parameter can be used to parallelize data preprocessing, which is critical for ensuring that preprocessing does not bottleneck the overall training workflow."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "MeiU4LobROdE"
- },
- "source": [
- "batch_size = 32\n",
- "\n",
- "train_loader = ds_train.pytorch(num_workers = 2, shuffle = True, transform = transform, batch_size = batch_size)\n",
- "test_loader = ds_test.pytorch(num_workers = 2, transform = transform, batch_size = batch_size)"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "2Dco8HW9ROXS"
- },
- "source": [
- "# device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
- "device = torch.device(\"cpu\")"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Snt5b6qwIZQ_"
- },
- "source": [
- "## Model Definition"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "e5LZrDU4I1GO"
- },
- "source": [
- "This tutorial uses a pre-trained [ResNet18](https://pytorch.org/hub/pytorch_vision_resnet/) neural network from the torchvision.models module, converted to a single-channel network for grayscale images."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "GRBRaROLROUf"
- },
- "source": [
- "# Simple model can be trained on a CPU\n",
- "device = torch.device(\"cpu\")\n",
- "\n",
- "net = models.resnet18(pretrained=True)\n",
- "# Convert model to grayscale\n",
- "net.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)\n",
- "\n",
- "# Update the fully connected layer based on the number of classes in the dataset\n",
- "net.fc = torch.nn.Linear(net.fc.in_features, len(ds_train.labels.info.class_names))\n",
- "\n",
- "net.to(device)\n",
- "\n",
- "# Specify the loss function and optimizer\n",
- "criterion = torch.nn.CrossEntropyLoss()\n",
- "optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.1)"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "8sVS5lTFI-gZ"
- },
- "source": [
- "## Training the Model"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "V65Xr8aBJCUL"
- },
- "source": [
- "Helper functions for training and testing the model are defined below. Note that the dictionary returned by the transform function in the PyTorch dataloader is accessed here and passed into the model." 
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "W6cQJkHeJGtk"
- },
- "source": [
- "def train_model(loader, test_loader, model, epochs = 1):\n",
- " for epoch in range(epochs): # loop over the dataset multiple times\n",
- "\n",
- " # Zero the performance stats for each epoch\n",
- " running_loss = 0.0\n",
- " start_time = time.time()\n",
- " total = 0\n",
- " correct = 0\n",
- " \n",
- " for i, data in enumerate(loader):\n",
- " # get the inputs; data is a dictionary of {'images': ..., 'labels': ...}\n",
- " inputs = data['images']\n",
- " labels = torch.squeeze(data['labels'])\n",
- "\n",
- " inputs = inputs.to(device)\n",
- " labels = labels.to(device)\n",
- "\n",
- " # zero the parameter gradients\n",
- " optimizer.zero_grad()\n",
- "\n",
- " # forward + backward + optimize\n",
- " outputs = model(inputs.float())\n",
- " loss = criterion(outputs, labels)\n",
- " loss.backward()\n",
- " optimizer.step()\n",
- " \n",
- " _, predicted = torch.max(outputs.data, 1)\n",
- " total += labels.size(0)\n",
- " correct += (predicted == labels).sum().item()\n",
- " accuracy = 100 * correct / total\n",
- " \n",
- " # Print performance statistics\n",
- " running_loss += loss.item()\n",
- " if i % 10 == 0: # print every 10 batches\n",
- " batch_time = time.time()\n",
- " speed = (i+1)/(batch_time-start_time)\n",
- " print('[%d, %5d] loss: %.3f, speed: %.2f, accuracy: %.2f %%' %\n",
- " (epoch + 1, i, running_loss, speed, accuracy))\n",
- "\n",
- " running_loss = 0.0\n",
- " \n",
- " print('Testing Model Performance')\n",
- " test_model(test_loader, model)\n",
- "\n",
- " print('Finished Training')\n",
- " \n",
- " \n",
- "def test_model(loader, model):\n",
- " start_time = time.time()\n",
- " total = 0\n",
- " correct = 0\n",
- " with torch.no_grad():\n",
- " for i, data in enumerate(loader):\n",
- " # get the inputs; data is a dictionary of {'images': ..., 'labels': ...}\n",
- " inputs = data['images']\n",
- " labels = torch.squeeze(data['labels'])\n",
- "\n",
- " inputs = inputs.to(device)\n",
- " labels = labels.to(device)\n",
- "\n",
- " # forward pass only (no gradients are tracked inside torch.no_grad())\n",
- " outputs = model(inputs.float())\n",
- "\n",
- " _, predicted = torch.max(outputs.data, 1)\n",
- " total += labels.size(0)\n",
- " correct += (predicted == labels).sum().item()\n",
- " accuracy = 100 * correct / total\n",
- " \n",
- " print('Finished Testing')\n",
- " print('Testing accuracy: %.1f %%' % (accuracy))"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "vQWzFjzLJINu"
- },
- "source": [
- "The model and data are ready for training. Let's gooooooooooo πŸš€!"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "fMhm4VjDRf7i"
- },
- "source": [
- "train_model(train_loader, test_loader, net, epochs = 1)"
- ],
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "79QnkE-UUySP"
- },
- "source": [
- "Congrats! You successfully trained an image classification model while streaming data directly from the cloud! 
πŸŽ‰" - ] - } - ] -} diff --git a/datasets/places365/upload.py b/datasets/places365/upload.py deleted file mode 100644 index 8bb93d3..0000000 --- a/datasets/places365/upload.py +++ /dev/null @@ -1,115 +0,0 @@ -import hub -import numpy as np -from PIL import Image -import argparse -import tqdm -import time - -import traceback -import sys - -import logging - -import torchvision.datasets as datasets - -NUM_WORKERS = 1 -DS_OUT_PATH = "./data/places365" # optionally s3://, gcs:// or hub:// path -DOWNLOAD = False -splits = [ - "train-standard", - # "val", - # "train-challenge" -] - -parser = argparse.ArgumentParser(description="Hub Places365 Uploading") -parser.add_argument("data", metavar="DIR", help="path to dataset") -parser.add_argument( - "--num_workers", - type=int, - default=NUM_WORKERS, - metavar="O", - help="number of workers to allocate", -) -parser.add_argument( - "--ds_out", - type=str, - default=DS_OUT_PATH, - metavar="O", - help="dataset path to be transformed into", -) - -parser.add_argument( - "--download", - type=bool, - default=DOWNLOAD, - metavar="O", - help="Download from the source http://places2.csail.mit.edu/download.html", -) - -args = parser.parse_args() - - -def define_dataset(path: str, class_names: list = []): - ds = hub.empty(path, overwrite=True) - - ds.create_tensor("images", htype="image", sample_compression="jpg") - ds.create_tensor("labels", htype="class_label", class_names=class_names) - - return ds - - -@hub.compute -def upload_parallel(pair_in, sample_out): - filepath, target = pair_in[0], pair_in[1] - try: - img = Image.open(filepath) - if len(img.size) == 2: - img = img.convert("RGB") - arr = np.asarray(img) - sample_out.images.append(arr) - sample_out.labels.append(target) - except Exception as e: - logging.error(f"failed uploading {filepath} with target {target}") - - -def upload_iteration(filenames_target: list, ds: hub.Dataset): - with ds: - for filepath, target in tqdm.tqdm(filenames_target): - try: - img = Image.open(filepath) - if len(img.size) == 2: - img = img.convert("RGB") - arr = np.asarray(img) - ds.images.append(arr) - ds.labels.append(target) - except Exception as e: - logging.error(f"failed uploading {filepath} with target {target}") - - -if __name__ == "__main__": - - for split in splits: - torch_dataset = datasets.Places365( - args.data, - split=split, - download=args.download, - ) - categories = torch_dataset.load_categories()[0] - categories = list(map(lambda x: "/".join(x.split("/")[2:]), categories)) - ds = define_dataset(f"{args.ds_out}-{split}", categories) - filenames_target = torch_dataset.load_file_list() - - print(f"uploading {split}...") - t1 = time.time() - if args.num_workers > 1: - - upload_parallel().eval( - filenames_target[0], - ds, - num_workers=args.num_workers, - scheduler="processed", - ) - else: - upload_iteration(filenames_target[0], ds) - t2 = time.time() - print(f"uploading {split} took {t2-t1}s") diff --git a/hub_examples/uploading/npy.py b/hub_examples/uploading/npy.py new file mode 100644 index 0000000..8acb1a5 --- /dev/null +++ b/hub_examples/uploading/npy.py @@ -0,0 +1,35 @@ +import hub +import numpy as np +from PIL import Image + + +def upload(uri: str): + """Upload some numpy data!""" + + ds = hub.empty(uri, overwrite=True) + + # initialize tensors + ds.create_tensor("x") + ds.create_tensor("images", htype="image", sample_compression="png") + + # add some uncompressed numpy data + ds.x.append(np.ones((10, 10))) + + # add some numpy data and compress as PNG + data = np.random.randint(low=0, high=256, 
size=(100, 100, 100, 3), dtype="uint8") + ds.images.extend(data) + + +def visualize(uri: str): + """Visualize some numpy data!""" + + ds = hub.load(uri, read_only=True) + + Image.fromarray(ds.images[0].numpy()).show() + Image.fromarray(ds.x[0].numpy()).show() + + +if __name__ == "__main__": + uri = "./_datasets/npy" + upload(uri) + visualize(uri) \ No newline at end of file From ed84577e103361bf2ba14a77718e3ba281e70767 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 8 Oct 2021 15:25:21 -0700 Subject: [PATCH 03/16] comments --- hub_examples/uploading/npy.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hub_examples/uploading/npy.py b/hub_examples/uploading/npy.py index 8acb1a5..a6877ad 100644 --- a/hub_examples/uploading/npy.py +++ b/hub_examples/uploading/npy.py @@ -12,12 +12,12 @@ def upload(uri: str): ds.create_tensor("x") ds.create_tensor("images", htype="image", sample_compression="png") - # add some uncompressed numpy data + # add some numpy data and keep uncompressed ds.x.append(np.ones((10, 10))) # add some numpy data and compress as PNG data = np.random.randint(low=0, high=256, size=(100, 100, 100, 3), dtype="uint8") - ds.images.extend(data) + ds.images.extend(data) # 100 random images, each 100x100x3 def visualize(uri: str): @@ -25,11 +25,12 @@ def visualize(uri: str): ds = hub.load(uri, read_only=True) - Image.fromarray(ds.images[0].numpy()).show() Image.fromarray(ds.x[0].numpy()).show() + Image.fromarray(ds.images[0].numpy()).show() if __name__ == "__main__": uri = "./_datasets/npy" + upload(uri) visualize(uri) \ No newline at end of file From a786ed4f48e1a5c517fee4983deda81b7c09efb2 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Fri, 8 Oct 2021 15:25:27 -0700 Subject: [PATCH 04/16] start imagenet --- hub_examples/training/imagenet.py | 51 +++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 hub_examples/training/imagenet.py diff --git a/hub_examples/training/imagenet.py b/hub_examples/training/imagenet.py new file mode 100644 index 0000000..6e7fed3 --- /dev/null +++ b/hub_examples/training/imagenet.py @@ -0,0 +1,51 @@ +from typing import Tuple +import hub +import torch + +from tqdm import tqdm + + +def transform_sample(sample: dict) -> Tuple[torch.Tensor, torch.Tensor]: + """Hub samples are a dictionary that maps a tensor's key -> hub.Tensor.""" + + image = sample["images"].numpy() + label = sample["labels"].numpy() + + x = torch.tensor(image).float() + x = torch.transpose(x, 0, 2) # (H, W, C) -> (C, W, H) + y = torch.tensor(label).int() + return x, y + + +if __name__ == "__main__": + dataloader_workers = 2 + max_samples = 10 # number of samples to use for training + batch_size = 1 + learning_rate = 0.01 + + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True).to(device) + + imagenet = hub.load("hub://jayanth/imagenet-2019-PIL-210914-train")[:max_samples] + dataloader = imagenet.pytorch(num_workers=dataloader_workers, transform=transform_sample, batch_size=batch_size) + + optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) + criterion = torch.nn.CrossEntropyLoss() + + # feedback: progress bar parameter for pytorch would be nice. 
saves some boilerplate + for X, T in dataloader: + X = X.to(device) + T = T.to(device) + + optimizer.zero_grad() + + Y = model(X) + # P = torch.nn.functional.softmax(Y, dim=1) + + # print(X.shape, Y.shape, P.shape, T.shape) + + loss = criterion(Y, T) + # loss.backward() + + # optimizer.step() + pass From 147bd3f70d18e5be8233e0fcf34136c5d4ce1161 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Sat, 9 Oct 2021 14:18:02 -0700 Subject: [PATCH 05/16] update link --- hub_examples/training/imagenet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hub_examples/training/imagenet.py b/hub_examples/training/imagenet.py index 6e7fed3..10a3ae8 100644 --- a/hub_examples/training/imagenet.py +++ b/hub_examples/training/imagenet.py @@ -26,7 +26,8 @@ def transform_sample(sample: dict) -> Tuple[torch.Tensor, torch.Tensor]: device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True).to(device) - imagenet = hub.load("hub://jayanth/imagenet-2019-PIL-210914-train")[:max_samples] + # TODO: imagenet coming soon! + imagenet = hub.load("hub://activeloop/imagenet")[:max_samples] dataloader = imagenet.pytorch(num_workers=dataloader_workers, transform=transform_sample, batch_size=batch_size) optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) From ec6269c77289dbd37ddd6c7a4348df0c61ac8b3d Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Sat, 9 Oct 2021 14:31:35 -0700 Subject: [PATCH 06/16] begin section 3 from KD --- .../training/knowledge_distillation.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 hub_examples/training/knowledge_distillation.py diff --git a/hub_examples/training/knowledge_distillation.py b/hub_examples/training/knowledge_distillation.py new file mode 100644 index 0000000..96e515f --- /dev/null +++ b/hub_examples/training/knowledge_distillation.py @@ -0,0 +1,32 @@ +import torch + +# NOTE: `PAPER:` denotes a quote from the cited paper. + +# reference paper: https://arxiv.org/pdf/1503.02531.pdf +# this script implements section 3: "Preliminary experiments on MNIST" + +# PAPER: "a single large neural net with two hidden layers of 1200 rectified linear hidden units" +# what is a hidden layer? 
https://medium.com/fintechexplained/what-are-hidden-layers-4f54f7328263 +net = torch.nn.Sequential( + torch.nn.Linear(784, 1200), # hidden layer 1 weights + torch.nn.ReLU(), + torch.nn.Linear(1200, 1200), # hidden layer 2 weights + torch.nn.ReLU(), + torch.nn.Linear(1200, 10), # output layer weights +) + +big_net = torch.nn.Sequential( + torch.nn.Linear(2, 10), + torch.nn.ReLU(), + torch.nn.Linear(10, 1), +) + +# PAPER: "a smaller net with two hidden layers of 800 rectified linear hidden units and no regularization" +# TODO + +# PAPER: "the net was strongly regularized using dropout and weight-constraints as described in [5]" +# TODO + + +# PAPER: "in addition, the input images were jittered by up to two pixels in any direction" +# TODO From 8008239ceea80c832be7d9d35de7ea1b885a94a0 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Sat, 9 Oct 2021 14:47:59 -0700 Subject: [PATCH 07/16] define networks and citations --- .../training/knowledge_distillation.py | 60 ++++++++++++------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/hub_examples/training/knowledge_distillation.py b/hub_examples/training/knowledge_distillation.py index 96e515f..2d9a9e5 100644 --- a/hub_examples/training/knowledge_distillation.py +++ b/hub_examples/training/knowledge_distillation.py @@ -1,32 +1,48 @@ import torch +from torch.nn import Sequential, Linear, ReLU, Dropout -# NOTE: `PAPER:` denotes a quote from the cited paper. -# reference paper: https://arxiv.org/pdf/1503.02531.pdf -# this script implements section 3: "Preliminary experiments on MNIST" +# PAPER1: https://arxiv.org/pdf/1503.02531.pdf (knowledge distillation, section 3) +# PAPER2: https://arxiv.org/pdf/1207.0580.pdf (preventing co-adaption, constraints) -# PAPER: "a single large neural net with two hidden layers of 1200 rectified linear hidden units" -# what is a hidden layer? https://medium.com/fintechexplained/what-are-hidden-layers-4f54f7328263 -net = torch.nn.Sequential( - torch.nn.Linear(784, 1200), # hidden layer 1 weights - torch.nn.ReLU(), - torch.nn.Linear(1200, 1200), # hidden layer 2 weights - torch.nn.ReLU(), - torch.nn.Linear(1200, 10), # output layer weights -) +# NOTE: this script implements section 3: "Preliminary experiments on MNIST" from PAPER1 -big_net = torch.nn.Sequential( - torch.nn.Linear(2, 10), - torch.nn.ReLU(), - torch.nn.Linear(10, 1), -) +"""Defining the first "big" network -# PAPER: "a smaller net with two hidden layers of 800 rectified linear hidden units and no regularization" -# TODO +PAPER1: + "a single large neural net with two hidden layers of 1200 rectified linear hidden units" + "the net was strongly regularized using dropout and weight-constraints as described in [5]" -# PAPER: "the net was strongly regularized using dropout and weight-constraints as described in [5]" -# TODO +External ref: + what is a hidden layer? https://medium.com/fintechexplained/what-are-hidden-layers-4f54f7328263 +PAPER2: + "each hidden unit is randomly omitted from the network with a probability of 0.5" + NOTE This implementation ignores this step: + "All layers had L2 weight constraints on the incoming weights of each hidden unit" + +""" +dp = 0.5 +big_net = Sequential( + # NOTE: don't dropout input data (probably a bad idea). exercise for the reader: why? 
+ Linear(784, 1200), # hidden layer 1 weights + ReLU(), + Dropout(dp), + Linear(1200, 1200), # hidden layer 2 weights + ReLU(), + Dropout(dp), + Linear(1200, 10), # output layer weights +) + +# PAPER1: "a smaller net with two hidden layers of 800 rectified linear hidden units and no regularization" +# NOTE: btw, both of these networks were generated first try by github copilot...... :_) +small_net = Sequential( + Linear(784, 800), # hidden layer 1 weights + ReLU(), + Linear(800, 800), # hidden layer 2 weights + ReLU(), + Linear(800, 10), # output layer weights +) -# PAPER: "in addition, the input images were jittered by up to two pixels in any direction" +# PAPER1: "in addition, the input images were jittered by up to two pixels in any direction" # TODO From 4304fc449a6d875258d3b7c87682bb6bed121289 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Sat, 9 Oct 2021 15:13:02 -0700 Subject: [PATCH 08/16] train big net and log metrics --- .../training/knowledge_distillation.py | 55 ++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/hub_examples/training/knowledge_distillation.py b/hub_examples/training/knowledge_distillation.py index 2d9a9e5..db2c294 100644 --- a/hub_examples/training/knowledge_distillation.py +++ b/hub_examples/training/knowledge_distillation.py @@ -1,5 +1,10 @@ +import hub +import numpy as np import torch + from torch.nn import Sequential, Linear, ReLU, Dropout +from torch.nn.functional import cross_entropy +from torch.optim import SGD # PAPER1: https://arxiv.org/pdf/1503.02531.pdf (knowledge distillation, section 3) @@ -44,5 +49,51 @@ Linear(800, 10), # output layer weights ) -# PAPER1: "in addition, the input images were jittered by up to two pixels in any direction" -# TODO +# NOTE ignore this step from PAPER1: "in addition, the input images were jittered by up to two pixels in any direction" +def transform(sample): + x = sample["images"].float() + t = sample["labels"].long() + return x, t + + +mnist = hub.load("hub://activeloop/mnist-train")[:256] + + + +# PAPER1: "To see how well distillation works, we trained [... describe nets ...] on all [MNIST] 60,000 training cases" + +def train(net, name: str, epochs=1, batch_size=64, lr=0.01): + dataloader = mnist.pytorch(transform=transform, batch_size=batch_size) + optim = SGD(net.parameters(), lr=lr) + + # you can use hub to log your metrics! 
+ metrics = hub.empty(f"./metrics_{name}", overwrite=True) + loss_epoch_average = metrics.create_tensor("loss_epoch_average", dtype=float) + + + for epoch in range(epochs): + epoch_loss = metrics.create_tensor(f"loss_epoch_{epoch}", dtype=float) + + for batch in dataloader: + optim.zero_grad() + + x, t = batch + + x = x.view(-1, 784) + t = t.view(-1) + + y = net(x) + + loss = cross_entropy(y, t) + loss.backward() + + optim.step() + + epoch_loss.append(loss.item()) + + loss_epoch_average.append(np.mean(epoch_loss)) + + return metrics + +metrics = train(big_net, "big_net") +print(metrics.loss_epoch_average.numpy()) From 30c2e6069c653bd3dd2396e1ea0deb3ed817123f Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Sat, 9 Oct 2021 15:15:18 -0700 Subject: [PATCH 09/16] clean code increase epochs --- .../training/knowledge_distillation.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/hub_examples/training/knowledge_distillation.py b/hub_examples/training/knowledge_distillation.py index db2c294..3e4e348 100644 --- a/hub_examples/training/knowledge_distillation.py +++ b/hub_examples/training/knowledge_distillation.py @@ -51,18 +51,18 @@ # NOTE ignore this step from PAPER1: "in addition, the input images were jittered by up to two pixels in any direction" def transform(sample): - x = sample["images"].float() - t = sample["labels"].long() + x = sample["images"].float().view(-1, 784) + t = sample["labels"].long().view(-1) return x, t -mnist = hub.load("hub://activeloop/mnist-train")[:256] +mnist = hub.load("hub://activeloop/mnist-train") # PAPER1: "To see how well distillation works, we trained [... describe nets ...] on all [MNIST] 60,000 training cases" -def train(net, name: str, epochs=1, batch_size=64, lr=0.01): +def train(net, name: str, epochs=1, batch_size=256, lr=0.01): dataloader = mnist.pytorch(transform=transform, batch_size=batch_size) optim = SGD(net.parameters(), lr=lr) @@ -70,30 +70,25 @@ def train(net, name: str, epochs=1, batch_size=64, lr=0.01): metrics = hub.empty(f"./metrics_{name}", overwrite=True) loss_epoch_average = metrics.create_tensor("loss_epoch_average", dtype=float) - for epoch in range(epochs): epoch_loss = metrics.create_tensor(f"loss_epoch_{epoch}", dtype=float) - for batch in dataloader: + for x, t in dataloader: optim.zero_grad() - x, t = batch - - x = x.view(-1, 784) - t = t.view(-1) - + # predict y = net(x) + # learn loss = cross_entropy(y, t) loss.backward() - optim.step() + # metrics epoch_loss.append(loss.item()) - loss_epoch_average.append(np.mean(epoch_loss)) return metrics -metrics = train(big_net, "big_net") +metrics = train(big_net, "big_net", epochs=10) print(metrics.loss_epoch_average.numpy()) From a92b539d597a76525af34d85f84acc36c7d2105f Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Sat, 9 Oct 2021 15:28:40 -0700 Subject: [PATCH 10/16] print metrics better --- .../training/knowledge_distillation.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/hub_examples/training/knowledge_distillation.py b/hub_examples/training/knowledge_distillation.py index 3e4e348..3201e1a 100644 --- a/hub_examples/training/knowledge_distillation.py +++ b/hub_examples/training/knowledge_distillation.py @@ -51,18 +51,19 @@ # NOTE ignore this step from PAPER1: "in addition, the input images were jittered by up to two pixels in any direction" def transform(sample): - x = sample["images"].float().view(-1, 784) - t = sample["labels"].long().view(-1) + x = sample["images"].float().view(-1) + t = 
sample["labels"].long() return x, t mnist = hub.load("hub://activeloop/mnist-train") - # PAPER1: "To see how well distillation works, we trained [... describe nets ...] on all [MNIST] 60,000 training cases" - +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") def train(net, name: str, epochs=1, batch_size=256, lr=0.01): + net = net.to(device) + dataloader = mnist.pytorch(transform=transform, batch_size=batch_size) optim = SGD(net.parameters(), lr=lr) @@ -74,6 +75,9 @@ def train(net, name: str, epochs=1, batch_size=256, lr=0.01): epoch_loss = metrics.create_tensor(f"loss_epoch_{epoch}", dtype=float) for x, t in dataloader: + x = x.to(device) + t = t.to(device).view(-1) + optim.zero_grad() # predict @@ -86,9 +90,11 @@ def train(net, name: str, epochs=1, batch_size=256, lr=0.01): # metrics epoch_loss.append(loss.item()) - loss_epoch_average.append(np.mean(epoch_loss)) + mean_loss = np.mean(epoch_loss) + loss_epoch_average.append(mean_loss) + print(f"epoch {epoch} loss: {mean_loss}") return metrics -metrics = train(big_net, "big_net", epochs=10) -print(metrics.loss_epoch_average.numpy()) +big_metrics = train(big_net, "big_net", epochs=10) +print(big_metrics.loss_epoch_average.numpy()) From 2cff8d8d72cd7753608cb56bae1cfd093ace9713 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Sun, 10 Oct 2021 18:05:51 -0700 Subject: [PATCH 11/16] define big net, train, and generate embeddings ds --- .../pytorch/knowledge_distillation/big_net.py | 63 ++++++++++++++++++ .../pytorch/knowledge_distillation/train.py | 66 +++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 hub_examples/pytorch/knowledge_distillation/big_net.py create mode 100644 hub_examples/pytorch/knowledge_distillation/train.py diff --git a/hub_examples/pytorch/knowledge_distillation/big_net.py b/hub_examples/pytorch/knowledge_distillation/big_net.py new file mode 100644 index 0000000..82766ef --- /dev/null +++ b/hub_examples/pytorch/knowledge_distillation/big_net.py @@ -0,0 +1,63 @@ +import pytorch_lightning as pl + +from torch.nn import Sequential, Linear, ReLU, Dropout +from torch.optim import Adam +from torch.nn.functional import cross_entropy + + +# TODO: paper citation + + +class BigNet(pl.LightningModule): + def __init__(self, lr=0.01): + """Defining the first "big" network. + + PAPER1: + "a single large neural net with two hidden layers of 1200 rectified linear hidden units" + "the net was strongly regularized using dropout and weight-constraints as described in [5]" + + External ref: + what is a hidden layer? https://medium.com/fintechexplained/what-are-hidden-layers-4f54f7328263 + + PAPER2: + "each hidden unit is randomly omitted from the network with a probability of 0.5" + NOTE This implementation ignores this step: + "All layers had L2 weight constraints on the incoming weights of each hidden unit" + + """ + + super().__init__() + + self.model = Sequential( + # NOTE: don't dropout input data (probably a bad idea). exercise for the reader: why? 
+ Linear(784, 1200), # hidden layer 1 weights + ReLU(), + Dropout(0.5), + Linear(1200, 1200), # hidden layer 2 weights + ReLU(), + Dropout(0.5), + Linear(1200, 10), # output layer weights + ) + + self.critereon = cross_entropy + self.lr = lr + + def forward(self, x): + return self.model(x) + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.forward(x) + loss = self.critereon(y_hat, y.view(-1)) + + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.forward(x) + loss = self.critereon(y_hat, y.view(-1)) + + return {"val_loss": loss} + + def configure_optimizers(self): + return Adam(self.parameters(), lr=self.lr) \ No newline at end of file diff --git a/hub_examples/pytorch/knowledge_distillation/train.py b/hub_examples/pytorch/knowledge_distillation/train.py new file mode 100644 index 0000000..edc8ae5 --- /dev/null +++ b/hub_examples/pytorch/knowledge_distillation/train.py @@ -0,0 +1,66 @@ +import hub + +import torch +import pytorch_lightning as pl + +from big_net import BigNet + + +MAX_SAMPLES = 128 +TRAIN_URI = "hub://activeloop/mnist-train" +TEST_URI = "hub://activeloop/mnist-test" + + + +def get_train_val_loaders(): + mnist_train = hub.load(TRAIN_URI)[:MAX_SAMPLES] + mnist_test = hub.load(TEST_URI)[:MAX_SAMPLES] + + def transform(sample): + x = sample["images"] + x = x.float().view(-1) + t = sample["labels"] + t = t.long() + return x, t + + train = mnist_train.pytorch(transform=transform, shuffle=False, batch_size=128, num_workers=4) + val = mnist_test.pytorch(transform=transform, shuffle=False, batch_size=128, num_workers=4) + + return train, val + + +def train_teacher(model: pl.LightningModule, epochs=1): + trainer = pl.Trainer(max_epochs=epochs) + train, val = get_train_val_loaders() + trainer.fit(model, train, val) + + +def generate_teacher_embedding_dataset(model: pl.LightningModule, output_dataset_uri: str): + mnist_train = hub.load(TRAIN_URI)[:MAX_SAMPLES] + + @hub.compute + def generate_embeddings(sample_in, sample_out): + x = torch.tensor(sample_in.images.numpy()).view(-1).float() + y = model(x).detach().numpy() + + sample_out.images.append(y) + + return sample_out + + embeddings = hub.empty(output_dataset_uri, overwrite=True) + embeddings.create_tensor("images", dtype=float) + + generate_embeddings().eval(mnist_train, embeddings, num_workers=0) + + +def train_learner(): + # TODO: require `train_teacher` to be called first, + # TODO: train the learner model on the new local dataset with the teacher embeddings + raise NotImplementedError + + +if __name__ == "__main__": + big_net = BigNet() + # train_teacher(big_net) + + generate_teacher_embedding_dataset(big_net, "./teacher_embeddings") \ No newline at end of file From 561672c36d69c98b5e100ab496ff7d7cffa8b9e5 Mon Sep 17 00:00:00 2001 From: McCrearyD Date: Sun, 10 Oct 2021 18:36:57 -0700 Subject: [PATCH 12/16] implement knowledge distillation --- .../pytorch/knowledge_distillation/big_net.py | 63 ----------- .../pytorch/knowledge_distillation/learner.py | 40 +++++++ .../pytorch/knowledge_distillation/models.py | 47 ++++++++ .../pytorch/knowledge_distillation/teacher.py | 43 ++++++++ .../pytorch/knowledge_distillation/train.py | 69 +++++++++--- .../training/knowledge_distillation.py | 100 ------------------ 6 files changed, 183 insertions(+), 179 deletions(-) delete mode 100644 hub_examples/pytorch/knowledge_distillation/big_net.py create mode 100644 hub_examples/pytorch/knowledge_distillation/learner.py create mode 100644 
hub_examples/pytorch/knowledge_distillation/models.py create mode 100644 hub_examples/pytorch/knowledge_distillation/teacher.py delete mode 100644 hub_examples/training/knowledge_distillation.py diff --git a/hub_examples/pytorch/knowledge_distillation/big_net.py b/hub_examples/pytorch/knowledge_distillation/big_net.py deleted file mode 100644 index 82766ef..0000000 --- a/hub_examples/pytorch/knowledge_distillation/big_net.py +++ /dev/null @@ -1,63 +0,0 @@ -import pytorch_lightning as pl - -from torch.nn import Sequential, Linear, ReLU, Dropout -from torch.optim import Adam -from torch.nn.functional import cross_entropy - - -# TODO: paper citation - - -class BigNet(pl.LightningModule): - def __init__(self, lr=0.01): - """Defining the first "big" network. - - PAPER1: - "a single large neural net with two hidden layers of 1200 rectified linear hidden units" - "the net was strongly regularized using dropout and weight-constraints as described in [5]" - - External ref: - what is a hidden layer? https://medium.com/fintechexplained/what-are-hidden-layers-4f54f7328263 - - PAPER2: - "each hidden unit is randomly omitted from the network with a probability of 0.5" - NOTE This implementation ignores this step: - "All layers had L2 weight constraints on the incoming weights of each hidden unit" - - """ - - super().__init__() - - self.model = Sequential( - # NOTE: don't dropout input data (probably a bad idea). exercise for the reader: why? - Linear(784, 1200), # hidden layer 1 weights - ReLU(), - Dropout(0.5), - Linear(1200, 1200), # hidden layer 2 weights - ReLU(), - Dropout(0.5), - Linear(1200, 10), # output layer weights - ) - - self.critereon = cross_entropy - self.lr = lr - - def forward(self, x): - return self.model(x) - - def training_step(self, batch, batch_idx): - x, y = batch - y_hat = self.forward(x) - loss = self.critereon(y_hat, y.view(-1)) - - return {"loss": loss} - - def validation_step(self, batch, batch_idx): - x, y = batch - y_hat = self.forward(x) - loss = self.critereon(y_hat, y.view(-1)) - - return {"val_loss": loss} - - def configure_optimizers(self): - return Adam(self.parameters(), lr=self.lr) \ No newline at end of file diff --git a/hub_examples/pytorch/knowledge_distillation/learner.py b/hub_examples/pytorch/knowledge_distillation/learner.py new file mode 100644 index 0000000..fc0d0a0 --- /dev/null +++ b/hub_examples/pytorch/knowledge_distillation/learner.py @@ -0,0 +1,40 @@ +import pytorch_lightning as pl + +from torch.optim import Adam +from torch.nn.functional import mse_loss, cross_entropy + + +# PAPER1: https://arxiv.org/pdf/1503.02531.pdf (knowledge distillation, section 3) +# PAPER2: https://arxiv.org/pdf/1207.0580.pdf (preventing co-adaption, constraints) + + +class Learner(pl.LightningModule): + def __init__(self, model, lr=0.01): + """Knowledge distillation learner. The incoming model will + be trained on the image embeddings from the trained `Teacher` model. 
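+
+    (A note on the objective: `training_step` below regresses the teacher's
+    raw output logits with MSE. PAPER1, section 2.1, treats matching logits
+    as a special, high-temperature case of distillation; the more general
+    softened-softmax matching at a temperature T is not implemented here.)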
+ """ + + super().__init__() + + self.model = model + self.lr = lr + + def forward(self, x): + return self.model(x) + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.forward(x) + loss = mse_loss(y_hat, y) + + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.forward(x) + loss = cross_entropy(y_hat, y.view(-1)) + + return {"val_loss": loss} + + def configure_optimizers(self): + return Adam(self.parameters(), lr=self.lr) \ No newline at end of file diff --git a/hub_examples/pytorch/knowledge_distillation/models.py b/hub_examples/pytorch/knowledge_distillation/models.py new file mode 100644 index 0000000..7856aa8 --- /dev/null +++ b/hub_examples/pytorch/knowledge_distillation/models.py @@ -0,0 +1,47 @@ +from torch.nn import Sequential, Linear, ReLU, Dropout + + +# PAPER1: https://arxiv.org/pdf/1503.02531.pdf (knowledge distillation, section 3) +# PAPER2: https://arxiv.org/pdf/1207.0580.pdf (preventing co-adaption, constraints) + + +def get_big_net(): + """Defining the first "big" network. + + PAPER1: + "a single large neural net with two hidden layers of 1200 rectified linear hidden units" + "the net was strongly regularized using dropout and weight-constraints as described in [5]" + + External ref: + what is a hidden layer? https://medium.com/fintechexplained/what-are-hidden-layers-4f54f7328263 + + PAPER2: + "each hidden unit is randomly omitted from the network with a probability of 0.5" + NOTE This implementation ignores this step: + "All layers had L2 weight constraints on the incoming weights of each hidden unit" + """ + + return Sequential( + # NOTE: don't dropout input data (probably a bad idea). exercise for the reader: why? + Linear(784, 1200), # hidden layer 1 weights + ReLU(), + Dropout(0.5), + Linear(1200, 1200), # hidden layer 2 weights + ReLU(), + Dropout(0.5), + Linear(1200, 10), # output layer weights + ) + + + +def get_small_net(): + # PAPER1: "a smaller net with two hidden layers of 800 rectified linear hidden units and no regularization" + # NOTE: btw, both of these networks were generated first try by github copilot...... :_) + + return Sequential( + Linear(784, 800), # hidden layer 1 weights + ReLU(), + Linear(800, 800), # hidden layer 2 weights + ReLU(), + Linear(800, 10), # output layer weights + ) \ No newline at end of file diff --git a/hub_examples/pytorch/knowledge_distillation/teacher.py b/hub_examples/pytorch/knowledge_distillation/teacher.py new file mode 100644 index 0000000..3c06cf3 --- /dev/null +++ b/hub_examples/pytorch/knowledge_distillation/teacher.py @@ -0,0 +1,43 @@ +import pytorch_lightning as pl + +from torch.optim import Adam +from torch.nn.functional import cross_entropy + + +# PAPER1: https://arxiv.org/pdf/1503.02531.pdf (knowledge distillation, section 3) +# PAPER2: https://arxiv.org/pdf/1207.0580.pdf (preventing co-adaption, constraints) + + +class Teacher(pl.LightningModule): + def __init__(self, model, lr=0.01): + """Knowledge distillation teacher. The incoming model will + be trained on the actual images. After it is trained, the + learner model will be trained on the final embeddings of the incoming + model to this class. 
+ """ + + super().__init__() + + self.model = model + self.critereon = cross_entropy + self.lr = lr + + def forward(self, x): + return self.model(x) + + def training_step(self, batch, batch_idx): + x, y = batch + y_hat = self.forward(x) + loss = self.critereon(y_hat, y.view(-1)) + + return {"loss": loss} + + def validation_step(self, batch, batch_idx): + x, y = batch + y_hat = self.forward(x) + loss = self.critereon(y_hat, y.view(-1)) + + return {"val_loss": loss} + + def configure_optimizers(self): + return Adam(self.parameters(), lr=self.lr) \ No newline at end of file diff --git a/hub_examples/pytorch/knowledge_distillation/train.py b/hub_examples/pytorch/knowledge_distillation/train.py index edc8ae5..d7414c5 100644 --- a/hub_examples/pytorch/knowledge_distillation/train.py +++ b/hub_examples/pytorch/knowledge_distillation/train.py @@ -3,16 +3,23 @@ import torch import pytorch_lightning as pl -from big_net import BigNet +from teacher import Teacher +from learner import Learner +from models import get_big_net, get_small_net + + +# PAPER1: https://arxiv.org/pdf/1503.02531.pdf (knowledge distillation, section 3) +# PAPER2: https://arxiv.org/pdf/1207.0580.pdf (preventing co-adaption, constraints) MAX_SAMPLES = 128 TRAIN_URI = "hub://activeloop/mnist-train" TEST_URI = "hub://activeloop/mnist-test" +EMBEDDINGS_URI = "./teacher_embeddings" -def get_train_val_loaders(): +def get_teacher_loaders(): mnist_train = hub.load(TRAIN_URI)[:MAX_SAMPLES] mnist_test = hub.load(TEST_URI)[:MAX_SAMPLES] @@ -29,38 +36,68 @@ def transform(sample): return train, val +def get_learner_loaders(): + mnist_embeddings = hub.load(EMBEDDINGS_URI)[:MAX_SAMPLES] + + def transform(sample): + x = sample["images"] + x = x.float().view(-1) + t = sample["labels"] + t = t.float() + return x, t + + train = mnist_embeddings.pytorch(transform=transform, shuffle=False, batch_size=128, num_workers=4) + + _, val = get_teacher_loaders() + + return train, val + def train_teacher(model: pl.LightningModule, epochs=1): trainer = pl.Trainer(max_epochs=epochs) - train, val = get_train_val_loaders() + train, val = get_teacher_loaders() trainer.fit(model, train, val) -def generate_teacher_embedding_dataset(model: pl.LightningModule, output_dataset_uri: str): +def generate_teacher_embedding_dataset(model): mnist_train = hub.load(TRAIN_URI)[:MAX_SAMPLES] @hub.compute def generate_embeddings(sample_in, sample_out): - x = torch.tensor(sample_in.images.numpy()).view(-1).float() + image = sample_in.images.numpy() + + x = torch.tensor(image).view(-1).float() y = model(x).detach().numpy() - sample_out.images.append(y) + sample_out.images.append(image) + sample_out.labels.append(y) return sample_out - embeddings = hub.empty(output_dataset_uri, overwrite=True) - embeddings.create_tensor("images", dtype=float) + embeddings = hub.empty(EMBEDDINGS_URI, overwrite=True) + embeddings.create_tensor("images", dtype="uint8") + embeddings.create_tensor("labels", dtype=float) generate_embeddings().eval(mnist_train, embeddings, num_workers=0) -def train_learner(): - # TODO: require `train_teacher` to be called first, - # TODO: train the learner model on the new local dataset with the teacher embeddings - raise NotImplementedError +def train_learner(model: pl.LightningModule, epochs=1): + trainer = pl.Trainer(max_epochs=epochs) + train, val = get_learner_loaders() + trainer.fit(model, train, val) if __name__ == "__main__": - big_net = BigNet() - # train_teacher(big_net) - - generate_teacher_embedding_dataset(big_net, "./teacher_embeddings") \ No newline at end of 
+    # first, we need to train the teacher network
+    big_net = get_big_net()
+    train_teacher(Teacher(big_net))
+
+    # now the teacher network is trained. let's generate a new hub dataset.
+    # this new dataset doesn't change the `images` tensor (it just copies it)
+    # but it DOES change the `labels` tensor. instead of the normal mnist labels,
+    # it uses the embeddings (outputs for each `images` sample) of the teacher model
+    generate_teacher_embedding_dataset(big_net)
+
+    # finally, we can train the learner network to predict the output embeddings
+    # of the teacher network. we can do so by using the new output embeddings dataset
+    small_net = get_small_net()
+    train_learner(Learner(small_net))
\ No newline at end of file
diff --git a/hub_examples/training/knowledge_distillation.py b/hub_examples/training/knowledge_distillation.py
deleted file mode 100644
index 3201e1a..0000000
--- a/hub_examples/training/knowledge_distillation.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import hub
-import numpy as np
-import torch
-
-from torch.nn import Sequential, Linear, ReLU, Dropout
-from torch.nn.functional import cross_entropy
-from torch.optim import SGD
-
-
-# PAPER1: https://arxiv.org/pdf/1503.02531.pdf (knowledge distillation, section 3)
-# PAPER2: https://arxiv.org/pdf/1207.0580.pdf (preventing co-adaption, constraints)
-
-# NOTE: this script implements section 3: "Preliminary experiments on MNIST" from PAPER1
-
-"""Defining the first "big" network
-
-PAPER1:
-    "a single large neural net with two hidden layers of 1200 rectified linear hidden units"
-    "the net was strongly regularized using dropout and weight-constraints as described in [5]"
-
-External ref:
-    what is a hidden layer? https://medium.com/fintechexplained/what-are-hidden-layers-4f54f7328263
-
-PAPER2:
-    "each hidden unit is randomly omitted from the network with a probability of 0.5"
-    NOTE This implementation ignores this step:
-        "All layers had L2 weight constraints on the incoming weights of each hidden unit"
-
-"""
-dp = 0.5
-big_net = Sequential(
-    # NOTE: don't dropout input data (probably a bad idea). exercise for the reader: why?
-    Linear(784, 1200), # hidden layer 1 weights
-    ReLU(),
-    Dropout(dp),
-    Linear(1200, 1200), # hidden layer 2 weights
-    ReLU(),
-    Dropout(dp),
-    Linear(1200, 10), # output layer weights
-)
-
-# PAPER1: "a smaller net with two hidden layers of 800 rectified linear hidden units and no regularization"
-# NOTE: btw, both of these networks were generated first try by github copilot...... :_)
-small_net = Sequential(
-    Linear(784, 800), # hidden layer 1 weights
-    ReLU(),
-    Linear(800, 800), # hidden layer 2 weights
-    ReLU(),
-    Linear(800, 10), # output layer weights
-)
-
-# NOTE ignore this step from PAPER1: "in addition, the input images were jittered by up to two pixels in any direction"
-def transform(sample):
-    x = sample["images"].float().view(-1)
-    t = sample["labels"].long()
-    return x, t
-
-
-mnist = hub.load("hub://activeloop/mnist-train")
-
-
-# PAPER1: "To see how well distillation works, we trained [... describe nets ...] on all [MNIST] 60,000 training cases"
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-def train(net, name: str, epochs=1, batch_size=256, lr=0.01):
-    net = net.to(device)
-
-    dataloader = mnist.pytorch(transform=transform, batch_size=batch_size)
-    optim = SGD(net.parameters(), lr=lr)
-
-    # you can use hub to log your metrics!
-    metrics = hub.empty(f"./metrics_{name}", overwrite=True)
-    loss_epoch_average = metrics.create_tensor("loss_epoch_average", dtype=float)
-
-    for epoch in range(epochs):
-        epoch_loss = metrics.create_tensor(f"loss_epoch_{epoch}", dtype=float)
-
-        for x, t in dataloader:
-            x = x.to(device)
-            t = t.to(device).view(-1)
-
-            optim.zero_grad()
-
-            # predict
-            y = net(x)
-
-            # learn
-            loss = cross_entropy(y, t)
-            loss.backward()
-            optim.step()
-
-            # metrics
-            epoch_loss.append(loss.item())
-
-        mean_loss = np.mean(epoch_loss)
-        loss_epoch_average.append(mean_loss)
-        print(f"epoch {epoch} loss: {mean_loss}")
-
-    return metrics
-
-
-big_metrics = train(big_net, "big_net", epochs=10)
-print(big_metrics.loss_epoch_average.numpy())

From f71b09b18539199daabbca2a529794b0632d54a5 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Mon, 11 Oct 2021 15:07:47 -0700
Subject: [PATCH 13/16] use shuffle true

---
 hub_examples/pytorch/knowledge_distillation/train.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hub_examples/pytorch/knowledge_distillation/train.py b/hub_examples/pytorch/knowledge_distillation/train.py
index d7414c5..9954cfb 100644
--- a/hub_examples/pytorch/knowledge_distillation/train.py
+++ b/hub_examples/pytorch/knowledge_distillation/train.py
@@ -30,7 +30,7 @@ def transform(sample):
         t = t.long()
         return x, t
 
-    train = mnist_train.pytorch(transform=transform, shuffle=False, batch_size=128, num_workers=4)
+    train = mnist_train.pytorch(transform=transform, shuffle=True, batch_size=128, num_workers=4)
     val = mnist_test.pytorch(transform=transform, shuffle=False, batch_size=128, num_workers=4)
 
     return train, val
@@ -95,9 +95,9 @@ def train_learner(model: pl.LightningModule, epochs=1):
     # this new dataset doesn't change the `images` tensor (it just copies it)
     # but it DOES change the `labels` tensor. instead of the normal mnist labels,
     # it uses the embeddings (outputs for each `images` sample) of the teacher model
-    generate_teacher_embedding_dataset(big_net)
+    # generate_teacher_embedding_dataset(big_net)
 
     # finally, we can train the learner network to predict the output embeddings
     # of the teacher network. we can do so by using the new output embeddings dataset
-    small_net = get_small_net()
-    train_learner(Learner(small_net))
\ No newline at end of file
+    # small_net = get_small_net()
+    # train_learner(Learner(small_net))
\ No newline at end of file

From 06b62acab771d1eea4144640aa0f2f43390a7ace Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Mon, 11 Oct 2021 15:21:16 -0700
Subject: [PATCH 14/16] use shuffling and change embeddings dir

---
 .gitignore                                   |  1 +
 .../pytorch/knowledge_distillation/train.py  | 13 ++++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/.gitignore b/.gitignore
index da37e0d..ddb23e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ __pycache__/
 *$py.class
 
 _datasets/*
+lightning_logs/*
 
 # C extensions
 *.so
diff --git a/hub_examples/pytorch/knowledge_distillation/train.py b/hub_examples/pytorch/knowledge_distillation/train.py
index 9954cfb..cab36dd 100644
--- a/hub_examples/pytorch/knowledge_distillation/train.py
+++ b/hub_examples/pytorch/knowledge_distillation/train.py
@@ -15,7 +15,7 @@
 MAX_SAMPLES = 128
 TRAIN_URI = "hub://activeloop/mnist-train"
 TEST_URI = "hub://activeloop/mnist-test"
-EMBEDDINGS_URI = "./teacher_embeddings"
+EMBEDDINGS_URI = "._datasets/teacher_embeddings"
 
 
@@ -46,7 +46,7 @@ def transform(sample):
         t = t.float()
         return x, t
 
-    train = mnist_embeddings.pytorch(transform=transform, shuffle=False, batch_size=128, num_workers=4)
+    train = mnist_embeddings.pytorch(transform=transform, shuffle=True, batch_size=128, num_workers=4)
 
     _, val = get_teacher_loaders()
 
@@ -88,6 +88,7 @@ def train_learner(model: pl.LightningModule, epochs=1):
 
 if __name__ == "__main__":
     # first, we need to train the teacher network
+    print("\n\nTraining teacher\n\n")
     big_net = get_big_net()
     train_teacher(Teacher(big_net))
 
@@ -95,9 +96,11 @@ def train_learner(model: pl.LightningModule, epochs=1):
     # this new dataset doesn't change the `images` tensor (it just copies it)
     # but it DOES change the `labels` tensor. instead of the normal mnist labels,
     # it uses the embeddings (outputs for each `images` sample) of the teacher model
-    # generate_teacher_embedding_dataset(big_net)
+    print("\n\nGenerating embedding dataset\n\n")
+    generate_teacher_embedding_dataset(big_net)
 
     # finally, we can train the learner network to predict the output embeddings
     # of the teacher network. we can do so by using the new output embeddings dataset
-    # small_net = get_small_net()
-    # train_learner(Learner(small_net))
\ No newline at end of file
+    print("\n\nTraining learner on embedding dataset\n\n")
+    small_net = get_small_net()
+    train_learner(Learner(small_net))
\ No newline at end of file

From 6f6f8637cb89ad44f96467dfceb5dc3cee0ef17a Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Mon, 11 Oct 2021 15:30:47 -0700
Subject: [PATCH 15/16] update lightning module field vars

---
 .../pytorch/knowledge_distillation/learner.py        |  4 ++--
 .../pytorch/knowledge_distillation/teacher.py        | 12 +++++++-----
 hub_examples/pytorch/knowledge_distillation/train.py |  2 +-
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/hub_examples/pytorch/knowledge_distillation/learner.py b/hub_examples/pytorch/knowledge_distillation/learner.py
index fc0d0a0..6207e5a 100644
--- a/hub_examples/pytorch/knowledge_distillation/learner.py
+++ b/hub_examples/pytorch/knowledge_distillation/learner.py
@@ -17,7 +17,7 @@ def __init__(self, model, lr=0.01):
         super().__init__()
 
         self.model = model
-        self.lr = lr
+        self.hparams.lr = lr
 
     def forward(self, x):
         return self.model(x)
@@ -37,4 +37,4 @@ def validation_step(self, batch, batch_idx):
         return {"val_loss": loss}
 
     def configure_optimizers(self):
-        return Adam(self.parameters(), lr=self.lr)
\ No newline at end of file
+        return Adam(self.parameters(), lr=self.hparams.lr)
\ No newline at end of file
diff --git a/hub_examples/pytorch/knowledge_distillation/teacher.py b/hub_examples/pytorch/knowledge_distillation/teacher.py
index 3c06cf3..0cbe49b 100644
--- a/hub_examples/pytorch/knowledge_distillation/teacher.py
+++ b/hub_examples/pytorch/knowledge_distillation/teacher.py
@@ -1,3 +1,4 @@
+import hub
 import pytorch_lightning as pl
 
 from torch.optim import Adam
@@ -19,8 +20,9 @@ def __init__(self, model, lr=0.01):
         super().__init__()
 
         self.model = model
-        self.critereon = cross_entropy
-        self.lr = lr
+        self.loss = cross_entropy
+
+        self.hparams.lr = lr
 
     def forward(self, x):
         return self.model(x)
@@ -28,16 +30,16 @@ def forward(self, x):
     def training_step(self, batch, batch_idx):
         x, y = batch
         y_hat = self.forward(x)
-        loss = self.critereon(y_hat, y.view(-1))
+        loss = self.loss(y_hat, y.view(-1))
 
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
         y_hat = self.forward(x)
-        loss = self.critereon(y_hat, y.view(-1))
+        loss = self.loss(y_hat, y.view(-1))
 
         return {"val_loss": loss}
 
     def configure_optimizers(self):
-        return Adam(self.parameters(), lr=self.lr)
\ No newline at end of file
+        return Adam(self.parameters(), lr=self.hparams.lr)
\ No newline at end of file
diff --git a/hub_examples/pytorch/knowledge_distillation/train.py b/hub_examples/pytorch/knowledge_distillation/train.py
index cab36dd..e5d889f 100644
--- a/hub_examples/pytorch/knowledge_distillation/train.py
+++ b/hub_examples/pytorch/knowledge_distillation/train.py
@@ -15,7 +15,7 @@
 MAX_SAMPLES = 128
 TRAIN_URI = "hub://activeloop/mnist-train"
 TEST_URI = "hub://activeloop/mnist-test"
-EMBEDDINGS_URI = "._datasets/teacher_embeddings"
+EMBEDDINGS_URI = "./_datasets/teacher_embeddings"

From 8bc582b8864ca65b54650e3bb9151d175b121394 Mon Sep 17 00:00:00 2001
From: McCrearyD
Date: Mon, 11 Oct 2021 15:54:19 -0700
Subject: [PATCH 16/16] use weights and biases

---
 .gitignore                                           |  2 ++
 .../pytorch/knowledge_distillation/teacher.py        | 17 +++++++++++++++--
 .../pytorch/knowledge_distillation/train.py          | 17 +++++++++--------
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index ddb23e0..7040563 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,8 @@ __pycache__/
 
 _datasets/*
 lightning_logs/*
+wandb/*
+examples-hub_examples*
 
 # C extensions
 *.so
diff --git a/hub_examples/pytorch/knowledge_distillation/teacher.py b/hub_examples/pytorch/knowledge_distillation/teacher.py
index 0cbe49b..7da2be0 100644
--- a/hub_examples/pytorch/knowledge_distillation/teacher.py
+++ b/hub_examples/pytorch/knowledge_distillation/teacher.py
@@ -1,14 +1,15 @@
-import hub
 import pytorch_lightning as pl
+import torch
 
 from torch.optim import Adam
 from torch.nn.functional import cross_entropy
 
+from pytorch_lightning.metrics.functional import accuracy
+
 
 # PAPER1: https://arxiv.org/pdf/1503.02531.pdf (knowledge distillation, section 3)
 # PAPER2: https://arxiv.org/pdf/1207.0580.pdf (preventing co-adaption, constraints)
 
-
 class Teacher(pl.LightningModule):
     def __init__(self, model, lr=0.01):
         """Knowledge distillation teacher. The incoming model will
@@ -29,15 +30,27 @@ def forward(self, x):
 
     def training_step(self, batch, batch_idx):
         x, y = batch
+
         y_hat = self.forward(x)
+
         loss = self.loss(y_hat, y.view(-1))
+        acc = accuracy(torch.argmax(y_hat, dim=1), y)
+
+        self.log('train_loss', loss, on_step=True, on_epoch=True, logger=True)
+        self.log('train_acc', acc, on_step=True, on_epoch=True, logger=True)
 
         return {"loss": loss}
 
     def validation_step(self, batch, batch_idx):
         x, y = batch
+
         y_hat = self.forward(x)
+
         loss = self.loss(y_hat, y.view(-1))
+        acc = accuracy(torch.argmax(y_hat, dim=1), y)
+
+        self.log('val_loss', loss, prog_bar=True)
+        self.log('val_acc', acc, prog_bar=True)
 
         return {"val_loss": loss}
diff --git a/hub_examples/pytorch/knowledge_distillation/train.py b/hub_examples/pytorch/knowledge_distillation/train.py
index e5d889f..a62b223 100644
--- a/hub_examples/pytorch/knowledge_distillation/train.py
+++ b/hub_examples/pytorch/knowledge_distillation/train.py
@@ -2,6 +2,7 @@
 import torch
 import pytorch_lightning as pl
+from pytorch_lightning.loggers import WandbLogger
 
 from teacher import Teacher
 from learner import Learner
@@ -12,7 +13,7 @@
 # PAPER2: https://arxiv.org/pdf/1207.0580.pdf (preventing co-adaptation, constraints)
 
 
-MAX_SAMPLES = 128
+MAX_SAMPLES = 60_000
 TRAIN_URI = "hub://activeloop/mnist-train"
 TEST_URI = "hub://activeloop/mnist-test"
 EMBEDDINGS_URI = "./_datasets/teacher_embeddings"
@@ -53,7 +54,7 @@ def transform(sample):
     return train, val
 
 def train_teacher(model: pl.LightningModule, epochs=1):
-    trainer = pl.Trainer(max_epochs=epochs)
+    trainer = pl.Trainer(max_epochs=epochs, logger=WandbLogger())
     train, val = get_teacher_loaders()
     trainer.fit(model, train, val)
@@ -81,7 +82,7 @@ def generate_embeddings(sample_in, sample_out):
 
 
 def train_learner(model: pl.LightningModule, epochs=1):
-    trainer = pl.Trainer(max_epochs=epochs)
+    trainer = pl.Trainer(max_epochs=epochs, logger=WandbLogger())
     train, val = get_learner_loaders()
     trainer.fit(model, train, val)
@@ -96,11 +97,11 @@ def train_learner(model: pl.LightningModule, epochs=1):
     # this new dataset doesn't change the `images` tensor (it just copies it)
     # but it DOES change the `labels` tensor. instead of the normal mnist labels,
     # it uses the embeddings (outputs for each `images` sample) of the teacher model
-    print("\n\nGenerating embedding dataset\n\n")
-    generate_teacher_embedding_dataset(big_net)
+    # print("\n\nGenerating embedding dataset\n\n")
+    # generate_teacher_embedding_dataset(big_net)
 
     # finally, we can train the learner network to predict the output embeddings
     # of the teacher network. we can do so by using the new output embeddings dataset
-    print("\n\nTraining learner on embedding dataset\n\n")
-    small_net = get_small_net()
-    train_learner(Learner(small_net))
\ No newline at end of file
+    # print("\n\nTraining learner on embedding dataset\n\n")
+    # small_net = get_small_net()
+    # train_learner(Learner(small_net))
\ No newline at end of file
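
A note on the distillation objective used in this series: the patches above fit the learner directly to the teacher's raw output vectors, which patch 12 stores in the `labels` tensor of the embeddings dataset. PAPER1 (section 2) describes the more common formulation, where the student matches temperature-softened class distributions plus a small hard-label term. Below is a minimal sketch of that soft-target loss for comparison; `distillation_loss`, `T`, and `alpha` are illustrative names and are not identifiers from this repository.

```python
import torch
import torch.nn.functional as F


def distillation_loss(student_logits, teacher_logits, targets, T=4.0, alpha=0.9):
    # Soft-target term: KL divergence between temperature-softened
    # distributions. F.kl_div expects log-probabilities for the input
    # and probabilities for the target.
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction="batchmean",
    ) * (T * T)  # T**2 keeps soft-target gradients on the same scale (PAPER1, section 2)

    # Hard-label term: ordinary cross entropy against the true labels.
    hard = F.cross_entropy(student_logits, targets)

    return alpha * soft + (1 - alpha) * hard


# usage sketch with random stand-in logits:
student_logits = torch.randn(32, 10)
teacher_logits = torch.randn(32, 10)
targets = torch.randint(0, 10, (32,))
loss = distillation_loss(student_logits, teacher_logits, targets)
```

With `alpha=1` and `T=1` this reduces to matching the teacher's softmax outputs directly, which is close in spirit to the embedding-regression approach the patches take.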