Skip to content

Commit

Permalink
Downloadable
Browse files Browse the repository at this point in the history
  • Loading branch information
zxytim committed May 10, 2023
1 parent ba76fcc commit fad89a3
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 18 deletions.
4 changes: 2 additions & 2 deletions manage/dump-db.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ def dump(pid, output):
os.mkdir(OUTPUT)
except:
pass
fout = open(os.path.join(OUTPUT, title + '.pdf'), 'w')
fout = open(os.path.join(OUTPUT, title + '.pdf'), 'wb')
fout.write(pdf)
fout.close()

npage = doc.get('page')
if npage:
for i in range(npage + 1):
fout = open(os.path.join(OUTPUT, title + '.html.{0}'.format(i)), 'w')
fout = open(os.path.join(OUTPUT, title + '.html.{0}'.format(i)), 'wb')
fout.write(doc['html'][i])
fout.close()

Expand Down
4 changes: 2 additions & 2 deletions pdf-compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ def get_args():
def main():
global args
args = get_args()
data = open(args.file).read()
data = open(args.file, 'rb').read()
newdata = pdf_compress(data)

if len(newdata) < len(data):
newfilename = args.file + '.compressed'
with open(newfilename, 'w') as fout:
with open(newfilename, 'wb') as fout:
fout.write(newdata)
os.remove(args.file)
os.rename(newfilename, args.file)
Expand Down
2 changes: 1 addition & 1 deletion sopaper/lib/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def wget_download(url, progress_updater, headers=None):
os.remove(tf.name)
raise FileCorrupted("wget failed with return code {}".format(ret))
else:
data = open(tf.name).read()
data = open(tf.name, 'rb').read()
progress_updater.finish(data)
os.remove(tf.name)
return data
Expand Down
2 changes: 1 addition & 1 deletion sopaper/lib/pdf2html.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(self, data, filename):
if data is not None:
f = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
f.close()
with open(f.name, 'w') as fpdf:
with open(f.name, 'wb') as fpdf:
fpdf.write(data)
self.fname = f.name
self.createfile = True
Expand Down
2 changes: 1 addition & 1 deletion sopaper/lib/pdfutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def pdf_compress(data):
log_err("Compress: ps2pdf14 failed!")
newdata = None
else:
newdata = open(f2.name).read()
newdata = open(f2.name, 'rb').read()
file_succ = newdata is not None and \
check_file_type(f2.name, 'PDF document') and \
len(newdata) >= ukconfig.FILE_SIZE_MINIMUM
Expand Down
4 changes: 2 additions & 2 deletions sopaper/lib/textutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ def levenshtein(s1, s2):
def title_correct(query, title):
""" return (match, update) """
title = title.replace('[PDF]', '')
q = ''.join([t for t in query if t in string.letters]).lower()
now = ''.join([t for t in title if t in string.letters]).lower()
q = ''.join([t for t in query if t in string.ascii_letters]).lower()
now = ''.join([t for t in title if t in string.ascii_letters]).lower()
ed_thres = min(len(query), len(title)) / 5
ERROR_RATIO = 0.6
if levenshtein(q, now) < ed_thres:
Expand Down
35 changes: 27 additions & 8 deletions sopaper/lib/ukutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,30 @@ def ensure_unicode_anytype(s):

def ensure_unicode(s):
"""assert type of s is basestring and convert s to unicode"""
assert isinstance(s, str), 's should be string' + str(s)
if isinstance(s, str):
s = s.decode('utf-8')
if isinstance(s, bytes):
return s.decode('utf-8')
assert isinstance(s, str)
# In python 3 all strings are assumed to be unicode
return s

# Python 2 legacy:
# assert isinstance(s, str), 's should be string' + str(s)
# if isinstance(s, str):
# s = s.decode('utf-8')
# return s

def ensure_bin_str(s):
"""assert type of s is basestring and convert s to byte string"""
assert isinstance(s, str), 's should be string'
if isinstance(s, str):
s = s.encode('utf-8')
return s
if isinstance(s, bytes):
return s
assert isinstance(s, str)
return s.encode('utf-8')

# Python 2 legacy:
# assert isinstance(s, str), 's should be string'
# if isinstance(s, str):
# s = s.encode('utf-8')
# return s

def import_all_modules(file_path, pkg_name):
"""import all modules recursively in a package
Expand All @@ -51,6 +64,9 @@ def import_all_modules(file_path, pkg_name):
import_module(module_name)

def check_buf_filetype(buf, need_type):
if isinstance(need_type, str):
need_type = need_type.encode('utf-8')

if ukconfig.USE_MAGIC_LIB:
s = magic.from_buffer(buf)
else:
Expand All @@ -67,11 +83,14 @@ def check_buf_filetype(buf, need_type):
return False

def check_file_type(fname, need_type):
    """Check whether the ``file`` utility's report for *fname* mentions *need_type*.

    Args:
        fname: path of the file to inspect.
        need_type: substring to look for in the output of ``file``
            (e.g. ``'PDF document'``); ``str`` values are encoded to UTF-8
            because the subprocess output is bytes.

    Returns:
        True if *need_type* occurs anywhere in the stdout of ``file``,
        False otherwise (including when ``file`` cannot be launched).
    """
    if isinstance(need_type, str):
        need_type = need_type.encode('utf-8')

    # Pass the path as an argv element instead of formatting it into a shell
    # string: names containing quotes, `$` or backticks must be treated as
    # data, never interpreted by a shell. `--` stops option parsing so a
    # name starting with '-' is still taken as a path.
    try:
        proc = Popen(['file', '--', fname], stdout=PIPE)
    except OSError:
        # With the old shell-based call a missing `file` binary produced
        # empty output (-> False); keep that best-effort behaviour.
        return False

    # communicate() drains stdout and waits for the child, so neither the
    # pipe nor the process is left dangling.
    output, _ = proc.communicate()

    # NOTE(review): the search runs over the whole stdout of `file`, which
    # presumably also echoes the path -- confirm whether a match inside the
    # file name should count.
    return output.find(need_type) != -1


if __name__ == '__main__':
print(check_filetype(open("./ukconfig.py").read(), 'PDF'))
print(check_filetype(open("./ukconfig.py", 'rb').read(), 'PDF'))
2 changes: 1 addition & 1 deletion sopaper/pdfprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def postprocess(data, ctx, pid):
if __name__ == '__main__':
import sys
f = sys.argv[1]
data = open(f).read()
data = open(f, 'rb').read()

text = pdf2text(data)
print(text)

0 comments on commit fad89a3

Please sign in to comment.