Downloadable

Commit fad89a3 · authored by ppwwyyxx · May 10, 2023
1 parent: ba76fcc
Show file tree

Hide file tree

Showing 8 changed files with 37 additions and 18 deletions.
diff --git a/manage/dump-db.py b/manage/dump-db.py
@@ -23,14 +23,14 @@ def dump(pid, output):
         os.mkdir(OUTPUT)
     except:
         pass
-    fout = open(os.path.join(OUTPUT, title + '.pdf'), 'w')
+    fout = open(os.path.join(OUTPUT, title + '.pdf'), 'wb')
     fout.write(pdf)
     fout.close()
 
     npage = doc.get('page')
     if npage:
         for i in range(npage + 1):
-            fout = open(os.path.join(OUTPUT, title + '.html.{0}'.format(i)), 'w')
+            fout = open(os.path.join(OUTPUT, title + '.html.{0}'.format(i)), 'wb')
             fout.write(doc['html'][i])
             fout.close()
 

diff --git a/pdf-compress.py b/pdf-compress.py
@@ -22,12 +22,12 @@ def get_args():
 def main():
     global args
     args = get_args()
-    data = open(args.file).read()
+    data = open(args.file, 'rb').read()
     newdata = pdf_compress(data)
 
     if len(newdata) < len(data):
         newfilename = args.file + '.compressed'
-        with open(newfilename, 'w') as fout:
+        with open(newfilename, 'wb') as fout:
             fout.write(newdata)
         os.remove(args.file)
         os.rename(newfilename, args.file)

diff --git a/sopaper/lib/downloader.py b/sopaper/lib/downloader.py
@@ -58,7 +58,7 @@ def wget_download(url, progress_updater, headers=None):
             os.remove(tf.name)
         raise FileCorrupted("wget failed with return code {}".format(ret))
     else:
-        data = open(tf.name).read()
+        data = open(tf.name, 'rb').read()
         progress_updater.finish(data)
         os.remove(tf.name)
     return data

diff --git a/sopaper/lib/pdf2html.py b/sopaper/lib/pdf2html.py
@@ -15,7 +15,7 @@ def __init__(self, data, filename):
         if data is not None:
             f = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
             f.close()
-            with open(f.name, 'w') as fpdf:
+            with open(f.name, 'wb') as fpdf:
                 fpdf.write(data)
             self.fname = f.name
             self.createfile = True

diff --git a/sopaper/lib/pdfutil.py b/sopaper/lib/pdfutil.py
@@ -73,7 +73,7 @@ def pdf_compress(data):
         log_err("Compress: ps2pdf14 failed!")
         newdata = None
     else:
-        newdata = open(f2.name).read()
+        newdata = open(f2.name, 'rb').read()
     file_succ = newdata is not None and \
             check_file_type(f2.name, 'PDF document') and \
             len(newdata) >= ukconfig.FILE_SIZE_MINIMUM

diff --git a/sopaper/lib/textutil.py b/sopaper/lib/textutil.py
@@ -65,8 +65,8 @@ def levenshtein(s1, s2):
 def title_correct(query, title):
     """ return (match, update) """
     title = title.replace('[PDF]', '')
-    q = ''.join([t for t in query if t in string.letters]).lower()
-    now = ''.join([t for t in title if t in string.letters]).lower()
+    q = ''.join([t for t in query if t in string.ascii_letters]).lower()
+    now = ''.join([t for t in title if t in string.ascii_letters]).lower()
     ed_thres = min(len(query), len(title)) / 5
     ERROR_RATIO = 0.6
     if levenshtein(q, now) < ed_thres:

diff --git a/sopaper/lib/ukutil.py b/sopaper/lib/ukutil.py
@@ -29,17 +29,30 @@ def ensure_unicode_anytype(s):
 
 def ensure_unicode(s):
     """assert type of s is basestring and convert s to unicode"""
-    assert isinstance(s, str), 's should be string' + str(s)
-    if isinstance(s, str):
-        s = s.decode('utf-8')
+    if isinstance(s, bytes):
+        return s.decode('utf-8')
+    assert isinstance(s, str)
+    # In python 3 all strings are assumed to be unicode
     return s
 
+    # Python 2 legacy:
+    # assert isinstance(s, str), 's should be string' + str(s)
+    # if isinstance(s, str):
+    #     s = s.decode('utf-8')
+    # return s
+
 def ensure_bin_str(s):
     """assert type of s is basestring and convert s to byte string"""
-    assert isinstance(s, str), 's should be string'
-    if isinstance(s, str):
-        s = s.encode('utf-8')
-    return s
+    if isinstance(s, bytes):
+        return s
+    assert isinstance(s, str)
+    return s.encode('utf-8')
+
+    # Python 2 legacy:
+    # assert isinstance(s, str), 's should be string'
+    # if isinstance(s, str):
+    #     s = s.encode('utf-8')
+    # return s
 
 def import_all_modules(file_path, pkg_name):
     """import all modules recursively in a package
@@ -51,6 +64,9 @@ def import_all_modules(file_path, pkg_name):
         import_module(module_name)
 
 def check_buf_filetype(buf, need_type):
+    if isinstance(need_type, str):
+        need_type = need_type.encode('utf-8')
+
     if ukconfig.USE_MAGIC_LIB:
         s = magic.from_buffer(buf)
     else:
@@ -67,11 +83,14 @@ def check_buf_filetype(buf, need_type):
         return False
 
 def check_file_type(fname, need_type):
+    if isinstance(need_type, str):
+        need_type = need_type.encode('utf-8')
+
     s = Popen('file "{0}"'.format(fname), stdout=PIPE, shell=True).stdout.read()
     if s.find(need_type) != -1:
         return True
     return False
 
 
 if __name__ == '__main__':
-    print(check_filetype(open("./ukconfig.py").read(), 'PDF'))
+    print(check_filetype(open("./ukconfig.py", 'rb').read(), 'PDF'))
diff --git a/sopaper/pdfprocess.py b/sopaper/pdfprocess.py
@@ -82,7 +82,7 @@ def postprocess(data, ctx, pid):
 if __name__ == '__main__':
     import sys
     f = sys.argv[1]
-    data = open(f).read()
+    data = open(f, 'rb').read()
 
     text = pdf2text(data)
     print(text)