Skip to content

Commit

Permalink
Update notebooks/gcloud-example/github-trend-analysis.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
Tuan Vu committed Dec 30, 2018
1 parent 654082d commit a6e25e2
Showing 1 changed file with 290 additions and 0 deletions.
290 changes: 290 additions & 0 deletions notebooks/gcloud-example/github-trend-analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,296 @@
"df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example final table: GitHub repositories trending on Hacker News on 2018-12-01"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"WITH github_activity AS (\n",
"SELECT \n",
" repo.name as repo,\n",
" CONCAT('https://github.com/', repo.name) as url,\n",
" SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n",
" SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n",
" COUNT(*) AS cnt\n",
"FROM `githubarchive.day.20181201`\n",
"WHERE type IN ('WatchEvent','ForkEvent')\n",
"GROUP BY 1,2\n",
"),\n",
"hacker_news AS (\n",
"SELECT\n",
" EXTRACT(DATE FROM timestamp) as date,\n",
" `by` AS submitter,\n",
" id as story_id,\n",
" REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n",
" SUM(score) as score\n",
"FROM\n",
" `bigquery-public-data.hacker_news.full`\n",
"WHERE\n",
" type = 'story'\n",
" AND EXTRACT(DATE FROM timestamp)='2018-12-01' \n",
" AND url LIKE '%https://github.com%'\n",
" AND url NOT LIKE '%github.com/blog/%'\n",
"GROUP BY 1,2,3,4\n",
")\n",
"\n",
"SELECT\n",
" a.date as date,\n",
" a.url as github_url,\n",
" b.repo as github_repo,\n",
" a.score as hn_score,\n",
" a.story_id as hn_story_id,\n",
" b.stars as stars,\n",
" b.forks as forks\n",
"FROM hacker_news as a\n",
"LEFT JOIN github_activity as b\n",
"ON a.url=b.url\n",
"ORDER BY hn_score DESC\n",
"LIMIT 10\n",
"\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>date</th>\n",
" <th>github_url</th>\n",
" <th>github_repo</th>\n",
" <th>hn_score</th>\n",
" <th>hn_story_id</th>\n",
" <th>stars</th>\n",
" <th>forks</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/ithinco/i-am-chinese-the-dr...</td>\n",
" <td>ithinco/i-am-chinese-the-dragonfly-must-go-on</td>\n",
" <td>129</td>\n",
" <td>18574181</td>\n",
" <td>60.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/YugaByte/yugabyte-db</td>\n",
" <td>YugaByte/yugabyte-db</td>\n",
" <td>115</td>\n",
" <td>18576170</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/alertlogic/erllambda</td>\n",
" <td>alertlogic/erllambda</td>\n",
" <td>64</td>\n",
" <td>18574683</td>\n",
" <td>48.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/oxplot/pdftilecut</td>\n",
" <td>oxplot/pdftilecut</td>\n",
" <td>64</td>\n",
" <td>18575094</td>\n",
" <td>91.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/chocolatey/boxstarter</td>\n",
" <td>chocolatey/boxstarter</td>\n",
" <td>9</td>\n",
" <td>18575802</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/devsnek/engine262</td>\n",
" <td>devsnek/engine262</td>\n",
" <td>8</td>\n",
" <td>18577658</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/andrewchaa/functional.pipe</td>\n",
" <td>andrewchaa/functional.pipe</td>\n",
" <td>4</td>\n",
" <td>18574107</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/anmonteiro/aws-lambda-ocaml...</td>\n",
" <td>anmonteiro/aws-lambda-ocaml-runtime</td>\n",
" <td>4</td>\n",
" <td>18578964</td>\n",
" <td>5.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/KumarAbhirup/bulk-mail-cli</td>\n",
" <td>None</td>\n",
" <td>4</td>\n",
" <td>18577887</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>2018-12-01</td>\n",
" <td>https://github.com/jerverless/jerverless</td>\n",
" <td>None</td>\n",
" <td>4</td>\n",
" <td>18577036</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" date github_url \\\n",
"0 2018-12-01 https://github.com/ithinco/i-am-chinese-the-dr... \n",
"1 2018-12-01 https://github.com/YugaByte/yugabyte-db \n",
"2 2018-12-01 https://github.com/alertlogic/erllambda \n",
"3 2018-12-01 https://github.com/oxplot/pdftilecut \n",
"4 2018-12-01 https://github.com/chocolatey/boxstarter \n",
"5 2018-12-01 https://github.com/devsnek/engine262 \n",
"6 2018-12-01 https://github.com/andrewchaa/functional.pipe \n",
"7 2018-12-01 https://github.com/anmonteiro/aws-lambda-ocaml... \n",
"8 2018-12-01 https://github.com/KumarAbhirup/bulk-mail-cli \n",
"9 2018-12-01 https://github.com/jerverless/jerverless \n",
"\n",
" github_repo hn_score hn_story_id \\\n",
"0 ithinco/i-am-chinese-the-dragonfly-must-go-on 129 18574181 \n",
"1 YugaByte/yugabyte-db 115 18576170 \n",
"2 alertlogic/erllambda 64 18574683 \n",
"3 oxplot/pdftilecut 64 18575094 \n",
"4 chocolatey/boxstarter 9 18575802 \n",
"5 devsnek/engine262 8 18577658 \n",
"6 andrewchaa/functional.pipe 4 18574107 \n",
"7 anmonteiro/aws-lambda-ocaml-runtime 4 18578964 \n",
"8 None 4 18577887 \n",
"9 None 4 18577036 \n",
"\n",
" stars forks \n",
"0 60.0 1.0 \n",
"1 2.0 NaN \n",
"2 48.0 NaN \n",
"3 91.0 NaN \n",
"4 1.0 NaN \n",
"5 1.0 NaN \n",
"6 2.0 NaN \n",
"7 5.0 NaN \n",
"8 NaN NaN \n",
"9 NaN NaN "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the final-table query for a single day.\n",
"#   {0} -> process_date_nodash: suffix of the daily `githubarchive.day.*` table\n",
"#   {1} -> process_date: date compared against the Hacker News story timestamp\n",
"# github_activity: per-repo counts of WatchEvent (stars) and ForkEvent (forks).\n",
"# hacker_news: stories whose URL contains https://github.com (blog links excluded),\n",
"#              reduced to the repository URL by REGEXP_EXTRACT.\n",
"# The two are LEFT JOINed on the repository URL, so stories without matching\n",
"# GitHub events that day keep NULL repo/stars/forks.\n",
"query = \"\"\"\n",
"WITH github_activity AS (\n",
"SELECT \n",
"    repo.name as repo,\n",
"    CONCAT('https://github.com/', repo.name) as url,\n",
"    SUM(IF(type='WatchEvent', 1, NULL)) AS stars,\n",
"    SUM(IF(type='ForkEvent', 1, NULL)) AS forks,\n",
"    COUNT(*) AS cnt\n",
"FROM `githubarchive.day.{0}`\n",
"WHERE type IN ('WatchEvent','ForkEvent')\n",
"GROUP BY 1,2\n",
"),\n",
"hacker_news AS (\n",
"SELECT\n",
"    EXTRACT(DATE FROM timestamp) as date,\n",
"    `by` AS submitter,\n",
"    id as story_id,\n",
"    REGEXP_EXTRACT(url, \"(https?://github.com/[^/]*/[^/#?]*)\") as url,\n",
"    SUM(score) as score\n",
"FROM\n",
"    `bigquery-public-data.hacker_news.full`\n",
"WHERE\n",
"    type = 'story'\n",
"    AND EXTRACT(DATE FROM timestamp)='{1}' \n",
"    AND url LIKE '%https://github.com%'\n",
"    AND url NOT LIKE '%github.com/blog/%'\n",
"GROUP BY 1,2,3,4\n",
")\n",
"\n",
"SELECT\n",
"    a.date as date,\n",
"    a.url as github_url,\n",
"    b.repo as github_repo,\n",
"    a.score as hn_score,\n",
"    a.story_id as hn_story_id,\n",
"    b.stars as stars,\n",
"    b.forks as forks\n",
"FROM hacker_news as a\n",
"LEFT JOIN github_activity as b\n",
"ON a.url=b.url\n",
"ORDER BY hn_score DESC\n",
"LIMIT 10\n",
"\"\"\".format(process_date_nodash, process_date)\n",
"\n",
"# Echo the rendered SQL so the exact statement sent to BigQuery is visible.\n",
"print(query)\n",
"\n",
"# Run the query with the standard SQL dialect and load the result into a DataFrame.\n",
"df = pd.read_gbq(\n",
"    query,\n",
"    project_id=project_id,\n",
"    dialect='standard',\n",
")\n",
"df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down

0 comments on commit a6e25e2

Please sign in to comment.