diff --git a/groupby.ipynb b/groupby.ipynb index bcc13ae..f4c124c 100644 --- a/groupby.ipynb +++ b/groupby.ipynb @@ -12,6 +12,980 @@ "import numpy as np" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 分组计算\n", + "\n", + "分组计算三步曲:拆分 -> 应用 -> 合并\n", + "\n", + "* 拆分:根据什么进行分组?\n", + "* 应用:每个分组进行什么样的计算?\n", + "* 合并:把每个分组的计算结果合并起来。\n", + "\n", + "\n", + "![groupby](groupby.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
data1data2key1key2
061aone
128atwo
247bone
346btwo
417aone
\n", + "
" + ], + "text/plain": [ + " data1 data2 key1 key2\n", + "0 6 1 a one\n", + "1 2 8 a two\n", + "2 4 7 b one\n", + "3 4 6 b two\n", + "4 1 7 a one" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],\n", + " 'key2': ['one', 'two', 'one', 'two', 'one'],\n", + " 'data1': np.random.randint(1, 10, 5),\n", + " 'data2': np.random.randint(1, 10, 5)})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 对 Series 进行分组\n", + "\n", + "通过索引对齐关联起来" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = df['data1'].groupby(df['key1'])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "key1\n", + "a 3\n", + "b 4\n", + "Name: data1, dtype: int32" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "key1 key2\n", + "a one 3.5\n", + " two 2.0\n", + "b one 4.0\n", + " two 4.0\n", + "Name: data1, dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['data1'].groupby([df['key1'], df['key2']]).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 对 DataFrame 进行分组" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
data1data2
key1
a35.333333
b46.500000
\n", + "
" + ], + "text/plain": [ + " data1 data2\n", + "key1 \n", + "a 3 5.333333\n", + "b 4 6.500000" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby('key1').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "key1 key2\n", + "a one 3.5\n", + " two 2.0\n", + "b one 4.0\n", + " two 4.0\n", + "Name: data1, dtype: float64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "means = df.groupby(['key1', 'key2']).mean()['data1']\n", + "means" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
key2onetwo
key1
a3.52
b4.04
\n", + "
" + ], + "text/plain": [ + "key2 one two\n", + "key1 \n", + "a 3.5 2\n", + "b 4.0 4" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "means.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "key1 key2\n", + "a one 3.5\n", + " two 2.0\n", + "b one 4.0\n", + " two 4.0\n", + "Name: data1, dtype: float64" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby(['key1', 'key2'])['data1'].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 每个分组的元素个数" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "key1 key2\n", + "a one 2\n", + " two 1\n", + "b one 1\n", + " two 1\n", + "dtype: int64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.groupby(['key1', 'key2']).size()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 对分组进行迭代" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a\n", + " data1 data2 key1 key2\n", + "0 6 1 a one\n", + "1 2 8 a two\n", + "4 1 7 a one\n", + "b\n", + " data1 data2 key1 key2\n", + "2 4 7 b one\n", + "3 4 6 b two\n" + ] + } + ], + "source": [ + "for name, group in df.groupby('key1'):\n", + " print name\n", + " print group\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('a', 'one')\n", + " data1 data2 key1 key2\n", + "0 6 1 a one\n", + "4 1 7 a one\n", + "('a', 'two')\n", + " data1 data2 key1 key2\n", + "1 2 8 a two\n", + "('b', 'one')\n", + " data1 data2 key1 key2\n", + "2 4 7 b one\n", + "('b', 'two')\n", + " data1 data2 key1 key2\n", + "3 4 6 b two\n" + ] + } + ], + "source": [ + "for name, group in df.groupby(['key1', 'key2']):\n", + " print name\n", + " print group" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 转化为字典" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'a': data1 data2 key1 key2\n", + " 0 6 1 a one\n", + " 1 2 8 a two\n", + " 4 1 7 a one, 'b': data1 data2 key1 key2\n", + " 2 4 7 b one\n", + " 3 4 6 b two}" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d = dict(list(df.groupby('key1')))\n", + "d" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
data1data2key1key2
061aone
128atwo
417aone
\n", + "
" + ], + "text/plain": [ + " data1 data2 key1 key2\n", + "0 6 1 a one\n", + "1 2 8 a two\n", + "4 1 7 a one" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d['a']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 按列分组" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "data1 int32\n", + "data2 int32\n", + "key1 object\n", + "key2 object\n", + "dtype: object" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{dtype('int32'): data1 data2\n", + " 0 6 1\n", + " 1 2 8\n", + " 2 4 7\n", + " 3 4 6\n", + " 4 1 7, dtype('O'): key1 key2\n", + " 0 a one\n", + " 1 a two\n", + " 2 b one\n", + " 3 b two\n", + " 4 a one}" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped = df.groupby(df.dtypes, axis=1)\n", + "dict(list(grouped))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 通过字典进行分组" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcde
Alice85373
Bob23369
Candy39791
Dark18364
Emily23113
\n", + "
" + ], + "text/plain": [ + " a b c d e\n", + "Alice 8 5 3 7 3\n", + "Bob 2 3 3 6 9\n", + "Candy 3 9 7 9 1\n", + "Dark 1 8 3 6 4\n", + "Emily 2 3 1 1 3" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), \n", + " columns=['a', 'b', 'c', 'd', 'e'], \n", + " index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
abcde
Alice85373
Bob2NaNNaN69
Candy39791
Dark18364
Emily23113
\n", + "
" + ], + "text/plain": [ + " a b c d e\n", + "Alice 8 5 3 7 3\n", + "Bob 2 NaN NaN 6 9\n", + "Candy 3 9 7 9 1\n", + "Dark 1 8 3 6 4\n", + "Emily 2 3 1 1 3" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.ix[1, 1:3] = np.NaN\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'orange', 'e': 'blue'}\n", + "grouped = df.groupby(mapping, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
blueorangered
Alice6713
Bob962
Candy8912
Dark769
Emily415
\n", + "
" + ], + "text/plain": [ + " blue orange red\n", + "Alice 6 7 13\n", + "Bob 9 6 2\n", + "Candy 8 9 12\n", + "Dark 7 6 9\n", + "Emily 4 1 5" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
blueorangered
Alice212
Bob111
Candy212
Dark212
Emily212
\n", + "
" + ], + "text/plain": [ + " blue orange red\n", + "Alice 2 1 2\n", + "Bob 1 1 1\n", + "Candy 2 1 2\n", + "Dark 2 1 2\n", + "Emily 2 1 2" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "blue 2\n", + "orange 1\n", + "red 2\n", + "dtype: int64" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped.size()" + ] + }, { "cell_type": "code", "execution_count": null,