always read as byte

Summary: I think we have to open the file as bytes explicitly. See [Specify when opening a file as binary](https://docs.python.org/3.3/howto/pyporting.html#specify-when-opening-a-file-as-binary). Always reading as bytes and decoding explicitly is good practice. I encountered an encoding problem when using a Japanese dataset, and this fix solves the problem. Closes facebookresearch#70 Reviewed By: piotr-bojanowski Differential Revision: D3900225 Pulled By: EdouardGrave fbshipit-source-id: e0b8b2b545f89ff75612f321ed3ba808bdea031e
ricleite · Sep 23, 2016 · 3b4579f · 3b4579f
1 parent b480e11
commit 3b4579f
Showing 1 changed file with 3 additions and 6 deletions.
diff --git a/eval.py b/eval.py
@@ -21,10 +21,7 @@
 import argparse
 
 def compat_splitting(line):
-    if sys.version > "3":
-        return line.split()
-    else: # if version is 2
-        return line.decode('utf8').split()
+    return line.decode('utf8').split()
 
 def similarity(v1, v2):
     n1 = np.linalg.norm(v1)
@@ -37,7 +34,7 @@ def similarity(v1, v2):
 args = parser.parse_args()
 
 vectors = {}
-fin = open(args.modelPath, 'r')
+fin = open(args.modelPath, 'rb')
 for i, line in enumerate(fin):
     try:
         tab = compat_splitting(line)
@@ -56,7 +53,7 @@ def similarity(v1, v2):
 drop = 0.0
 nwords = 0.0
 
-fin = open(args.dataPath, 'r')
+fin = open(args.dataPath, 'rb')
 for line in fin:
     tline = compat_splitting(line)
     word1 = tline[0].lower()