Merge pull request aws-samples#168 from Yatrie/master
Clustering Notebooks
Showing 7 changed files with 607 additions and 0 deletions.
207 changes: 207 additions & 0 deletions
...ooks/advanced/Clustering_Preprocessing/01. Optional - Data Cleaning and Preparation.ipynb
@@ -0,0 +1,207 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## (Optional) Data Cleaning and Preparation\n",
"\n",
"This notebook pulls the UCI Online Retail II Data Set and cleans / preprocesses it for use with clustering algorithms.\n",
"\n",
"Data Source: https://archive.ics.uci.edu/ml/datasets/Online+Retail+II\n",
"\n",
"File Location: https://archive.ics.uci.edu/ml/machine-learning-databases/00502/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"from urllib.request import urlretrieve\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pull data file from source\n",
"dir_path = './data'\n",
"\n",
"if not os.path.exists(dir_path):\n",
" os.makedirs(dir_path)\n",
" \n",
"urlretrieve(\"https://archive.ics.uci.edu/ml/machine-learning-databases/00502/online_retail_II.xlsx\",\n",
" f\"{dir_path}/online_retail_II.xlsx\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install openpyxl"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read the raw xlsx data\n",
"df_raw = pd.read_excel('./data/online_retail_II.xlsx',\n",
" engine='openpyxl')\n",
"\n",
"print(df_raw.shape, df_raw.columns)\n",
"\n",
"df_raw.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# subset data needed for analysis and roll up to daily frequency\n",
"df_clean = df_raw[['StockCode', 'InvoiceDate', 'Quantity']]\n", | ||
"\n", | ||
"df_clean['timestamp'] = df_clean['InvoiceDate'].dt.date\n", | ||
"\n", | ||
"df_clean = (df_clean\n", | ||
" .groupby(['StockCode', 'timestamp'])['Quantity']\n", | ||
" .agg('sum')\n", | ||
" .reset_index())\n", | ||
"\n", | ||
"print(df_clean.shape, df_clean.columns, df_clean.dtypes)\n", | ||
"\n", | ||
"df_clean.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# prepare data for resampling of time series and fill missing values\n", | ||
"df_pivot = df_clean.pivot(index='timestamp',\n", | ||
" columns='StockCode',\n", | ||
" values='Quantity')\n", | ||
"\n", | ||
"print(df_pivot.shape, df_pivot.columns)\n", | ||
"\n", | ||
"df_pivot.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# columns with unusual stock code data\n", | ||
"print(list(df_pivot.columns)[-75:])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# drop columns with unusual stock code data\n", | ||
"drop_cols = ['ADJUST', 'ADJUST2', 'AMAZONFEE', 'B', 'BANK CHARGES', 'C2', 'C3', 'D', \n", | ||
" 'DOT', 'GIFT', 'M', 'PADS', 'POST', 'S', 'SP1002', 'TEST001', 'TEST002',\n", | ||
" 'gift_0001_10', 'gift_0001_20', 'gift_0001_30', 'gift_0001_40', 'gift_0001_50',\n", | ||
" 'gift_0001_60', 'gift_0001_70', 'gift_0001_80', 'gift_0001_90', 'm']\n", | ||
"\n", | ||
"df_pivot.drop(columns=drop_cols, inplace=True)\n", | ||
"\n", | ||
"print(df_pivot.shape, df_pivot.columns)\n", | ||
"\n", | ||
"df_pivot.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# resample time series data and fill missing values with 0s\n", | ||
"df_pivot.index = pd.DatetimeIndex(df_pivot.index)\n", | ||
"\n", | ||
"df_pivot = df_pivot.resample('D').sum().fillna(0)\n", | ||
"\n", | ||
"print(df_pivot.shape, df_pivot.columns)\n", | ||
"\n", | ||
"df_pivot.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# transpose data to match format neeed for further processing\n", | ||
"df_final = df_pivot.T\n", | ||
"df_final = df_final.reset_index()\n", | ||
"\n", | ||
"print(df_final.shape, df_final.columns)\n", | ||
"\n", | ||
"df_final.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# back up data -> used for clustering and Forecast training in later notebooks\n", | ||
"df_final.to_csv('./data/df_pivoted.csv.zip', index=None)" | ||
] | ||
}, | ||
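{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is an optional sanity check, a minimal sketch rather than part of the original pipeline: it re-loads the zipped CSV written above to confirm the backup round-trips. It assumes pandas infers zip compression from the `.csv.zip` extension, which it does for a single-member archive."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional sanity check (not part of the original pipeline):\n",
"# re-load the saved backup and compare its shape against df_final\n",
"df_check = pd.read_csv('./data/df_pivoted.csv.zip')\n",
"\n",
"print(df_check.shape, df_final.shape)\n",
"\n",
"df_check.head()"
]
},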
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### End of processing"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}