Merge pull request aws-samples#168 from Yatrie/master
Clustering Notebooks
Showing 7 changed files with 607 additions and 0 deletions.
207 changes: 207 additions & 0 deletions
...ooks/advanced/Clustering_Preprocessing/01. Optional - Data Cleaning and Preparation.ipynb
@@ -0,0 +1,207 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## (Optional) Data Cleaning and Preparation\n",
"\n",
"This notebook pulls the UCI Online Retail II Data Set and cleans / preprocesses it for use with clustering algorithms.\n",
"\n",
"Data Source: https://archive.ics.uci.edu/ml/datasets/Online+Retail+II\n",
"\n",
"File Location: https://archive.ics.uci.edu/ml/machine-learning-databases/00502/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"from urllib.request import urlretrieve\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pull data file from source\n",
"dir_path = './data'\n",
"\n",
"if not os.path.exists(dir_path):\n",
" os.makedirs(dir_path)\n",
" \n",
"urlretrieve(\"https://archive.ics.uci.edu/ml/machine-learning-databases/00502/online_retail_II.xlsx\",\n",
" f\"{dir_path}/online_retail_II.xlsx\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install openpyxl"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read the raw xlsx data\n",
"df_raw = pd.read_excel('./data/online_retail_II.xlsx',\n",
" engine='openpyxl')\n",
"\n",
"print(df_raw.shape, df_raw.columns)\n",
"\n",
"df_raw.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# subset data needed for analysis and roll up to daily frequency\n",
"df_clean = df_raw[['StockCode', 'InvoiceDate', 'Quantity']]\n", | ||
"\n", | ||
"df_clean['timestamp'] = df_clean['InvoiceDate'].dt.date\n", | ||
"\n", | ||
"df_clean = (df_clean\n", | ||
" .groupby(['StockCode', 'timestamp'])['Quantity']\n", | ||
" .agg('sum')\n", | ||
" .reset_index())\n", | ||
"\n", | ||
"print(df_clean.shape, df_clean.columns, df_clean.dtypes)\n", | ||
"\n", | ||
"df_clean.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# prepare data for resampling of time series and fill missing values\n", | ||
"df_pivot = df_clean.pivot(index='timestamp',\n", | ||
" columns='StockCode',\n", | ||
" values='Quantity')\n", | ||
"\n", | ||
"print(df_pivot.shape, df_pivot.columns)\n", | ||
"\n", | ||
"df_pivot.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# columns with unusual stock code data\n", | ||
"print(list(df_pivot.columns)[-75:])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# drop columns with unusual stock code data\n", | ||
"drop_cols = ['ADJUST', 'ADJUST2', 'AMAZONFEE', 'B', 'BANK CHARGES', 'C2', 'C3', 'D', \n", | ||
" 'DOT', 'GIFT', 'M', 'PADS', 'POST', 'S', 'SP1002', 'TEST001', 'TEST002',\n", | ||
" 'gift_0001_10', 'gift_0001_20', 'gift_0001_30', 'gift_0001_40', 'gift_0001_50',\n", | ||
" 'gift_0001_60', 'gift_0001_70', 'gift_0001_80', 'gift_0001_90', 'm']\n", | ||
"\n", | ||
"df_pivot.drop(columns=drop_cols, inplace=True)\n", | ||
"\n", | ||
"print(df_pivot.shape, df_pivot.columns)\n", | ||
"\n", | ||
"df_pivot.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# resample time series data and fill missing values with 0s\n", | ||
"df_pivot.index = pd.DatetimeIndex(df_pivot.index)\n", | ||
"\n", | ||
"df_pivot = df_pivot.resample('D').sum().fillna(0)\n", | ||
"\n", | ||
"print(df_pivot.shape, df_pivot.columns)\n", | ||
"\n", | ||
"df_pivot.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# transpose data to match format neeed for further processing\n", | ||
"df_final = df_pivot.T\n", | ||
"df_final = df_final.reset_index()\n", | ||
"\n", | ||
"print(df_final.shape, df_final.columns)\n", | ||
"\n", | ||
"df_final.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# back up data -> used for clustering and Forecast training in later notebooks\n", | ||
"df_final.to_csv('./data/df_pivoted.csv.zip', index=None)" | ||
] | ||
}, | ||
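{
"cell_type": "markdown",
"metadata": {},
"source": [
"The next cell is an optional sanity check, a minimal sketch rather than part of the original pipeline: it re-loads the zipped CSV written above to confirm the backup round-trips. It assumes pandas infers zip compression from the `.csv.zip` extension, which it does for a single-member archive."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional sanity check (not part of the original pipeline):\n",
"# re-load the saved backup and compare its shape against df_final\n",
"df_check = pd.read_csv('./data/df_pivoted.csv.zip')\n",
"\n",
"print(df_check.shape, df_final.shape)\n",
"\n",
"df_check.head()"
]
},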
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### End of processing"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}