diff --git a/groupby.ipynb b/groupby.ipynb
index bcc13ae..f4c124c 100644
--- a/groupby.ipynb
+++ b/groupby.ipynb
@@ -12,6 +12,980 @@
"import numpy as np"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 分组计算\n",
+ "\n",
+ "分组计算三步曲:拆分 -> 应用 -> 合并\n",
+ "\n",
+ "* 拆分:根据什么进行分组?\n",
+ "* 应用:每个分组进行什么样的计算?\n",
+ "* 合并:把每个分组的计算结果合并起来。\n",
+ "\n",
+ "\n",
+ "![groupby](groupby.png)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " data1 | \n",
+ " data2 | \n",
+ " key1 | \n",
+ " key2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " a | \n",
+ " one | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " a | \n",
+ " two | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 7 | \n",
+ " b | \n",
+ " one | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 6 | \n",
+ " b | \n",
+ " two | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " a | \n",
+ " one | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " data1 data2 key1 key2\n",
+ "0 6 1 a one\n",
+ "1 2 8 a two\n",
+ "2 4 7 b one\n",
+ "3 4 6 b two\n",
+ "4 1 7 a one"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Seed NumPy's global RNG so the random columns are identical on every Restart & Run All.\n",
+ "# Outputs saved before the seed was added will differ from a fresh run.\n",
+ "np.random.seed(42)\n",
+ "df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],\n",
+ "                   'key2': ['one', 'two', 'one', 'two', 'one'],\n",
+ "                   'data1': np.random.randint(1, 10, 5),\n",
+ "                   'data2': np.random.randint(1, 10, 5)})\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 对 Series 进行分组\n",
+ "\n",
+ "分组键(`df['key1']`)与被分组的数据(`df['data1']`)通过索引对齐关联起来。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "grouped = df['data1'].groupby(df['key1'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "key1\n",
+ "a 3\n",
+ "b 4\n",
+ "Name: data1, dtype: int32"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped.mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "key1 key2\n",
+ "a one 3.5\n",
+ " two 2.0\n",
+ "b one 4.0\n",
+ " two 4.0\n",
+ "Name: data1, dtype: float64"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df['data1'].groupby([df['key1'], df['key2']]).mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 对 DataFrame 进行分组"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " data1 | \n",
+ " data2 | \n",
+ "
\n",
+ " \n",
+ " key1 | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " a | \n",
+ " 3 | \n",
+ " 5.333333 | \n",
+ "
\n",
+ " \n",
+ " b | \n",
+ " 4 | \n",
+ " 6.500000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " data1 data2\n",
+ "key1 \n",
+ "a 3 5.333333\n",
+ "b 4 6.500000"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Select the numeric columns explicitly: key2 holds strings and cannot be averaged.\n",
+ "df.groupby('key1')[['data1', 'data2']].mean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "key1 key2\n",
+ "a one 3.5\n",
+ " two 2.0\n",
+ "b one 4.0\n",
+ " two 4.0\n",
+ "Name: data1, dtype: float64"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "means = df.groupby(['key1', 'key2']).mean()['data1']\n",
+ "means"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " key2 | \n",
+ " one | \n",
+ " two | \n",
+ "
\n",
+ " \n",
+ " key1 | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " a | \n",
+ " 3.5 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " b | \n",
+ " 4.0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "key2 one two\n",
+ "key1 \n",
+ "a 3.5 2\n",
+ "b 4.0 4"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "means.unstack()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "key1 key2\n",
+ "a one 3.5\n",
+ " two 2.0\n",
+ "b one 4.0\n",
+ " two 4.0\n",
+ "Name: data1, dtype: float64"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.groupby(['key1', 'key2'])['data1'].mean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 每个分组的元素个数"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "key1 key2\n",
+ "a one 2\n",
+ " two 1\n",
+ "b one 1\n",
+ " two 1\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.groupby(['key1', 'key2']).size()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 对分组进行迭代"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "a\n",
+ " data1 data2 key1 key2\n",
+ "0 6 1 a one\n",
+ "1 2 8 a two\n",
+ "4 1 7 a one\n",
+ "b\n",
+ " data1 data2 key1 key2\n",
+ "2 4 7 b one\n",
+ "3 4 6 b two\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Iterating a GroupBy yields (group key, sub-frame) pairs.\n",
+ "# print() with a single argument behaves the same on Python 2 and Python 3.\n",
+ "for name, group in df.groupby('key1'):\n",
+ "    print(name)\n",
+ "    print(group)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "('a', 'one')\n",
+ " data1 data2 key1 key2\n",
+ "0 6 1 a one\n",
+ "4 1 7 a one\n",
+ "('a', 'two')\n",
+ " data1 data2 key1 key2\n",
+ "1 2 8 a two\n",
+ "('b', 'one')\n",
+ " data1 data2 key1 key2\n",
+ "2 4 7 b one\n",
+ "('b', 'two')\n",
+ " data1 data2 key1 key2\n",
+ "3 4 6 b two\n"
+ ]
+ }
+ ],
+ "source": [
+ "# With several grouping keys, the group name is a tuple such as ('a', 'one').\n",
+ "# print() with a single argument behaves the same on Python 2 and Python 3.\n",
+ "for name, group in df.groupby(['key1', 'key2']):\n",
+ "    print(name)\n",
+ "    print(group)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 转化为字典"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'a': data1 data2 key1 key2\n",
+ " 0 6 1 a one\n",
+ " 1 2 8 a two\n",
+ " 4 1 7 a one, 'b': data1 data2 key1 key2\n",
+ " 2 4 7 b one\n",
+ " 3 4 6 b two}"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "d = dict(list(df.groupby('key1')))\n",
+ "d"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " data1 | \n",
+ " data2 | \n",
+ " key1 | \n",
+ " key2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " a | \n",
+ " one | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 8 | \n",
+ " a | \n",
+ " two | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " a | \n",
+ " one | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " data1 data2 key1 key2\n",
+ "0 6 1 a one\n",
+ "1 2 8 a two\n",
+ "4 1 7 a one"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "d['a']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 按列分组"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "data1 int32\n",
+ "data2 int32\n",
+ "key1 object\n",
+ "key2 object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{dtype('int32'): data1 data2\n",
+ " 0 6 1\n",
+ " 1 2 8\n",
+ " 2 4 7\n",
+ " 3 4 6\n",
+ " 4 1 7, dtype('O'): key1 key2\n",
+ " 0 a one\n",
+ " 1 a two\n",
+ " 2 b one\n",
+ " 3 b two\n",
+ " 4 a one}"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped = df.groupby(df.dtypes, axis=1)\n",
+ "dict(list(grouped))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 通过字典进行分组"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ " d | \n",
+ " e | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Alice | \n",
+ " 8 | \n",
+ " 5 | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " Bob | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " Candy | \n",
+ " 3 | \n",
+ " 9 | \n",
+ " 7 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " Dark | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " Emily | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c d e\n",
+ "Alice 8 5 3 7 3\n",
+ "Bob 2 3 3 6 9\n",
+ "Candy 3 9 7 9 1\n",
+ "Dark 1 8 3 6 4\n",
+ "Emily 2 3 1 1 3"
+ ]
+ },
+ "execution_count": 83,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.DataFrame(np.random.randint(1, 10, (5, 5)), \n",
+ " columns=['a', 'b', 'c', 'd', 'e'], \n",
+ " index=['Alice', 'Bob', 'Candy', 'Dark', 'Emily'])\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a | \n",
+ " b | \n",
+ " c | \n",
+ " d | \n",
+ " e | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Alice | \n",
+ " 8 | \n",
+ " 5 | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " Bob | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 6 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " Candy | \n",
+ " 3 | \n",
+ " 9 | \n",
+ " 7 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " Dark | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " Emily | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " a b c d e\n",
+ "Alice 8 5 3 7 3\n",
+ "Bob 2 NaN NaN 6 9\n",
+ "Candy 3 9 7 9 1\n",
+ "Dark 1 8 3 6 4\n",
+ "Emily 2 3 1 1 3"
+ ]
+ },
+ "execution_count": 84,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Blank out row 1 (Bob), columns 1-2 (b and c) by position; the slice end is exclusive.\n",
+ "df.iloc[1, 1:3] = np.nan\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'orange', 'e': 'blue'}\n",
+ "grouped = df.groupby(mapping, axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " blue | \n",
+ " orange | \n",
+ " red | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Alice | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " Bob | \n",
+ " 9 | \n",
+ " 6 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " Candy | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " Dark | \n",
+ " 7 | \n",
+ " 6 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " Emily | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " blue orange red\n",
+ "Alice 6 7 13\n",
+ "Bob 9 6 2\n",
+ "Candy 8 9 12\n",
+ "Dark 7 6 9\n",
+ "Emily 4 1 5"
+ ]
+ },
+ "execution_count": 88,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped.sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " blue | \n",
+ " orange | \n",
+ " red | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Alice | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " Bob | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " Candy | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " Dark | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " Emily | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " blue orange red\n",
+ "Alice 2 1 2\n",
+ "Bob 1 1 1\n",
+ "Candy 2 1 2\n",
+ "Dark 2 1 2\n",
+ "Emily 2 1 2"
+ ]
+ },
+ "execution_count": 89,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped.count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "blue 2\n",
+ "orange 1\n",
+ "red 2\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grouped.size()"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,