Commit

update to v0.2
letiantian committed Dec 15, 2015
1 parent 614a91e commit 7234885
Showing 21 changed files with 623 additions and 378 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
build
textrank4zh/__pycache__
*.pyc
5 changes: 5 additions & 0 deletions HISTORY.md
@@ -0,0 +1,5 @@
### 2014

Implementation of the core features.

### 2015
34 changes: 34 additions & 0 deletions setup.py
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
from distutils.core import setup
LONGDOC = """
Please go to https://github.com/someus/TextRank4ZH for more info.
"""

setup(
name='textrank4zh',
version='0.2',
description='Extract keywords and abstracts from Chinese articles',
long_description=LONGDOC,
author='Letian Sun',
author_email='[email protected]',
url='https://github.com/someus/TextRank4ZH',
license="MIT",
classifiers=[
'Intended Audience :: Developers',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Natural Language :: Chinese (Simplified)',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Topic :: Text Processing',
'Topic :: Text Processing :: Linguistic',
],
keywords='NLP, Chinese, keyword extraction, abstract extraction',
install_requires=['jieba >= 0.35', 'numpy >= 1.7.1', 'networkx >= 1.9.1'],
packages=['textrank4zh'],
package_dir={'textrank4zh':'textrank4zh'},
package_data={'textrank4zh':['*.txt',]},
)
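
The setup.py above registers the textrank4zh package, its dependencies (jieba, numpy, networkx), and the bundled *.txt data files. As a quick post-install sanity check, a minimal sketch (assuming the package has been installed, e.g. with python setup.py install) is simply to import the classes that the new test files below exercise:

# -*- coding: utf-8 -*-
# Smoke test (sketch): confirm the installed package exposes the public
# classes used by the tests added in this commit.
from textrank4zh import TextRank4Keyword, TextRank4Sentence

print(TextRank4Keyword)
print(TextRank4Sentence)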
6 changes: 5 additions & 1 deletion test.py
@@ -31,4 +31,8 @@
tr4s.train(text=text, speech_tag_filter=True, lower=True, source = 'all_filters')

print '摘要:'
print '\n'.join(tr4s.get_key_sentences(num=3)) # the three sentences with the highest importance
print '\n'.join(tr4s.get_key_sentences(num=3)) # the three sentences with the highest importance

import os
print __file__
print os.path.dirname(os.path.realpath(__file__))
37 changes: 37 additions & 0 deletions test/Segmentation_test.py
@@ -0,0 +1,37 @@
#-*- encoding:utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import sys
import codecs
from textrank4zh import Segmentation

seg = Segmentation.Segmentation()

# text = codecs.open('../text/01.txt', 'r', 'utf-8', 'ignore').read()
text = "视频里,我们的杰宝热情地用英文和全场观众打招呼并清唱了一段《Heal The World》。我们的世界充满了未知数。"

result = seg.segment(text=text, lower=True)

for key in result:
    print(key)

print(20*'#')
for s in result['sentences']:
    print(s)

print(20*'*')
for s in result.sentences:
    print(s)

print()
for ss in result.words_no_filter:
    print(' '.join(ss))

print()
for ss in result.words_no_stop_words:
    print(' / '.join(ss))

print()
for ss in result.words_all_filters:
    print(' | '.join(ss))
20 changes: 20 additions & 0 deletions test/TextRank4Keyword_test.py
@@ -0,0 +1,20 @@
#-*- encoding:utf-8 -*-
from __future__ import print_function

import sys
import codecs
from textrank4zh import TextRank4Keyword

# text = codecs.open('../text/02.txt', 'r', 'utf-8').read()
text = "世界的美好。世界美国英国。 世界和平。"

tr4w = TextRank4Keyword()
tr4w.analyze(text=text, lower=True, window=3, pagerank_config={'alpha': 0.85})

for item in tr4w.get_keywords(30, word_min_len=2):
    print(item.word, item.weight)

print('--phrase--')

for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=0):
    print(phrase)
17 changes: 17 additions & 0 deletions test/TextRank4Sentence_test.py
@@ -0,0 +1,17 @@
#-*- encoding:utf-8 -*-
from __future__ import print_function

import sys
import codecs
from textrank4zh import TextRank4Sentence

# text = codecs.open('./doc/03.txt', 'r', 'utf-8').read()
text = "这间酒店位于北京东三环,里面摆放很多雕塑,文艺气息十足。答谢宴于晚上8点开始。"
tr4s = TextRank4Sentence()
tr4s.analyze(text=text, lower=True, source='all_filters')

print('\n'.join(tr4s.sentences))

print(20*'*')
for item in tr4s.get_key_sentences(num=4):
    print(item.weight, item.sentence)
7 changes: 7 additions & 0 deletions test/codecs_test.py
@@ -0,0 +1,7 @@
#-*- encoding:utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import codecs
text = codecs.open('./doc/01.txt', 'r', 'utf-8', 'ignore').read()
print(type(text))  # unicode in Python 2, str in Python 3
5 files renamed without changes.
10 changes: 10 additions & 0 deletions test/jieba_test.py
@@ -0,0 +1,10 @@
#-*- encoding:utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import jieba.posseg as pseg
words = pseg.cut("我爱北京天安门.。;‘你的#")
for w in words:
    print('{0} {1}'.format(w.word, w.flag))
    print(type(w.word))  # unicode in Python 2, str in Python 3

25 changes: 25 additions & 0 deletions test/util_test.py
@@ -0,0 +1,25 @@
#-*- encoding:utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import sys

from textrank4zh import util

def testAttrDict():
    r = util.AttrDict(a=2)
    print(r)
    print(r.a)
    print(r['a'])

def testCombine():
    print(20*'*')
    for item in util.combine(['a', 'b', 'c', 'd'], 2):
        print(item)
    print()
    for item in util.combine(['a', 'b', 'c', 'd'], 3):
        print(item)

if __name__ == "__main__":
    testAttrDict()
    testCombine()
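
For reference, the AttrDict exercised above is the usual dict-with-attribute-access helper: r.a and r['a'] return the same value, as the test prints show. A minimal sketch of such a class, given only for illustration and not necessarily the project's exact implementation in util.py:

class AttrDict(dict):
    """A dict whose keys can also be read and written as attributes (sketch)."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value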