Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
amigcamel committed May 13, 2014
0 parents commit 9cc194e
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 0 deletions.
Empty file added README.md
Empty file.
38 changes: 38 additions & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#-*-coding:utf-8-*-
import urllib, urllib2, cookielib, re

def parseTree(string):
    """Submit text to the Sinica web parser and return the parse results.

    The function loads the parser's landing page to obtain the form ``id``
    token, posts the text (encoded as cp950) to the parser endpoint and
    extracts every ``<nobr>#N:...</nobr>`` entry from the response.

    Args:
        string: The text to parse, either a ``unicode`` object or a
            UTF-8 encoded byte string.

    Returns:
        A list of ``unicode`` strings, one per ``<nobr>#N:...</nobr>``
        entry found in the response (empty if none were found).

    Raises:
        UnicodeError: If ``string`` is not ``unicode`` and cannot be decoded
            as UTF-8. Encoding failures from the cp950 conversion propagate
            unchanged.
        ValueError: If the landing page does not contain the form ``id``
            field.
    """
    if not isinstance(string, unicode):
        try:
            string = string.decode('utf-8')
        except (UnicodeError, AttributeError):
            # AttributeError covers inputs without a ``decode`` method, which
            # the previous bare ``except`` also reported as UnicodeError.
            raise UnicodeError('Input encoding should be UTF8 or UNICODE')
    # The request payload is sent as cp950 bytes and the reply is decoded
    # from cp950 below.
    string = string.encode('cp950')

    URL = 'http://parser.iis.sinica.edu.tw/'
    resURL = 'http://parser.iis.sinica.edu.tw/svr/webparser.asp'

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('User-Agent', 'Mozilla/5.0 Gecko/20100101 Firefox/29.0'),
        ('referer', 'http://parser.iis.sinica.edu.tw/'),
        ('Host', 'parser.iis.sinica.edu.tw')
    ]

    # Fetch the landing page through the cookie-aware opener so that any
    # cookie set alongside the form id is stored in ``cj`` and sent back
    # with the POST below.
    landing = opener.open(URL)
    try:
        raw = landing.read()
    finally:
        landing.close()

    id_match = re.search(r'name="id" value="(\d+)"', raw)
    if id_match is None:
        raise ValueError('Could not find the form "id" field on ' + URL)
    fid = id_match.group(1)

    postdata = urllib.urlencode({'myTag': string, 'id': fid})

    response = opener.open(resURL, postdata)
    try:
        res = response.read()
    finally:
        response.close()

    res = res.decode('cp950')
    return re.findall(r'<nobr>#\d+:(.*?)</nobr>', res)

0 comments on commit 9cc194e

Please sign in to comment.