# Standard library
import re
from urllib.request import urlopen

# Third-party
import pymysql
from bs4 import BeautifulSoup
# Connect to the local MySQL server over TCP (the commit replaced the
# earlier unix_socket-based connection with host/port plus a password).
# NOTE(review): credentials are hard-coded ('root'/'root') — move them to
# environment variables or a config file before this leaves development.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='root', db='mysql', charset='utf8')
cur = conn.cursor()
# All subsequent statements on this cursor run against the wikipedia schema.
cur.execute("USE wikipedia")
def getLinks(pageUrl, recursionLevel):
    """Recursively crawl Wikipedia starting at *pageUrl*, recording links.

    Each visited page is stored via insertPageIfNotExists and every
    article-to-article link via insertLink (both defined elsewhere in this
    file); newly discovered pages are added to the module-level ``pages``
    set and crawled in turn.

    :param pageUrl: article path such as "/wiki/Kevin_Bacon"
    :param recursionLevel: current crawl depth, 0 at the entry point
    """
    # Cap the crawl depth: recursionLevel is incremented on every recursive
    # call but was never checked in the visible code, so nothing bounded the
    # recursion depth (Python's recursion limit would eventually be hit).
    # NOTE(review): the diff hunk begins inside the function body, so the
    # full file may already contain a (possibly different) depth check —
    # confirm against the complete source before merging.
    if recursionLevel > 4:
        return
    pageId = insertPageIfNotExists(pageUrl)
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    bsObj = BeautifulSoup(html)
    # Match only internal article links: paths starting with /wiki/ that
    # contain no ":" (which would indicate namespace pages like File: or
    # Talk:).
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
        insertLink(pageId, insertPageIfNotExists(link.attrs['href']))
        if link.attrs['href'] not in pages:
            # We have encountered a new page; add it and search it for links
            newPage = link.attrs['href']
            print(newPage)
            pages.add(newPage)
            getLinks(newPage, recursionLevel + 1)
# Start the crawl from the Kevin Bacon article at depth 0.
getLinks("/wiki/Kevin_Bacon", 0)