allow ignoring word frequency while providing pos tag

ateddy · May 23, 2015 · 3b76328 · 3b76328
1 parent 3ec4c43
commit 3b76328
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -90,10 +90,19 @@ print(", ".join(seg_list))
 
 * 开发者可以指定自己自定义的词典，以便包含 jieba 词库里没有的词。虽然 jieba 有新词识别能力，但是自行添加新词可以保证更高的正确率
 * 用法： jieba.load_userdict(file_name) # file_name 为自定义词典的路径
-* 词典格式和`dict.txt`一样，一个词占一行；每一行分三部分，一部分为词语，另一部分为词频（可省略），最后为词性（可省略），用空格隔开
-* 词频可省略，使用计算出的能保证分出该词的词频
+* 词典格式和 `dict.txt` 一样，一个词占一行；每一行分三部分：词语、词频（可省略）、词性（可省略），用空格隔开，顺序不可颠倒。
+* 词频省略时使用自动计算的能保证分出该词的词频。
 
-* 更改分词器的 tmp_dir 和 cache_file 属性，可指定缓存文件位置，用于受限的文件系统。
+**例如：**
+
+```
+创新办 3 i
+云计算 5
+凱特琳 nz
+台中
+```
+
+* 更改分词器（默认为 jieba.dt）的 tmp_dir 和 cache_file 属性，可指定缓存文件位置，用于受限的文件系统。
 
 * 范例：
 
@@ -506,13 +515,24 @@ Output:
 
 ### Load dictionary
 
-* Developers can specify their own custom dictionary to be included in the jieba default dictionary. Jieba is able to identify new words, but adding your own new words can ensure a higher accuracy.
+* Developers can specify their own custom dictionary to be included in the jieba default dictionary. Jieba is able to identify new words, but adding your own new words can ensure a higher accuracy.
 * Usage: `jieba.load_userdict(file_name) # file_name is the path of the custom dictionary`
-* The dictionary format is the same as that of `analyse/idf.txt`: one word per line; each line is divided into two parts, the first is the word itself, the other is the word frequency, separated by a space
-* The word frequency can be omitted, then a calculated value will be used.
+* The dictionary format is the same as that of `dict.txt`: one word per line; each line is divided into three parts separated by a space, in this fixed order: word, word frequency, POS tag.
+* The word frequency and the POS tag are each optional and can be omitted independently of one another. If the word frequency is omitted, a value that guarantees the word can be segmented out is calculated automatically.
+
+**For example:**
+
+```
+创新办 3 i
+云计算 5
+凱特琳 nz
+台中
+```
+
+
 * Change a Tokenizer's `tmp_dir` and `cache_file` to specify the path of the cache file, for using on a restricted file system.
 
-* Example：
+* Example:
 
         云计算 5
         李小福 2

diff --git a/jieba/__init__.py b/jieba/__init__.py
@@ -362,7 +362,15 @@ def load_userdict(self, f):
                 if not line:
                     continue
                 tup = line.split(" ")
-                self.add_word(*tup)
+                freq, tag = None, None
+                if len(tup) == 2:
+                    if tup[1].isdigit():
+                        freq = tup[1]
+                    else:
+                        tag = tup[1]
+                elif len(tup) > 2:
+                    freq, tag = tup[1], tup[2]
+                self.add_word(tup[0], freq, tag)
             except Exception:
                 raise ValueError(
                     'invalid dictionary entry in %s at Line %s: %s' % (
@@ -377,13 +385,10 @@ def add_word(self, word, freq=None, tag=None):
         """
         self.check_initialized()
         word = strdecode(word)
-        if freq is None:
-            freq = self.suggest_freq(word, False)
-        else:
-            freq = int(freq)
+        freq = int(freq) if freq else self.suggest_freq(word, False)
         self.FREQ[word] = freq
         self.total += freq
-        if tag is not None:
+        if tag:
             self.user_word_tag_tab[word] = tag
         for ch in xrange(len(word)):
             wfrag = word[:ch + 1]
@@ -475,7 +480,7 @@ def set_dictionary(self, dictionary_path):
 
 # global functions
 
-FREQ = dt.FREQ
+get_FREQ = lambda k, d=None: dt.FREQ.get(k, d)
 add_word = dt.add_word
 calc = dt.calc
 cut = dt.cut

diff --git a/test/test_userdict.py b/test/test_userdict.py
@@ -43,6 +43,6 @@
 for sent, seg in testlist:
     print('/'.join(jieba.cut(sent, HMM=False)))
     word = ''.join(seg)
-    print('%s Before: %s, After: %s' % (word, jieba.FREQ[word], jieba.suggest_freq(seg, True)))
+    print('%s Before: %s, After: %s' % (word, jieba.get_FREQ(word), jieba.suggest_freq(seg, True)))
     print('/'.join(jieba.cut(sent, HMM=False)))
     print("-"*40)
diff --git a/test/userdict.txt b/test/userdict.txt
@@ -6,3 +6,4 @@ easy_install 3 eng
 韩玉赏鉴 3 nz
 八一双鹿 3 nz
 台中
+凱特琳 nz