Commit

update to v0.2
letiantian committed Dec 15, 2015
1 parent 614a91e commit 7234885
Showing 21 changed files with 623 additions and 378 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
build
textrank4zh/__pycache__
*.pyc
5 changes: 5 additions & 0 deletions HISTORY.md
@@ -0,0 +1,5 @@
### 2014

Implementation of the core features.

### 2015
34 changes: 34 additions & 0 deletions setup.py
@@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
from distutils.core import setup
LONGDOC = """
Please go to https://github.com/someus/TextRank4ZH for more info.
"""

setup(
name='textrank4zh',
version='0.2',
description='Extract keywords and abstracts from Chinese articles',
long_description=LONGDOC,
author='Letian Sun',
author_email='[email protected]',
url='https://github.com/someus/TextRank4ZH',
license="MIT",
classifiers=[
'Intended Audience :: Developers',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Natural Language :: Chinese (Simplified)',
'Natural Language :: Chinese (Traditional)',
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Topic :: Text Processing',
'Topic :: Text Processing :: Linguistic',
],
keywords='NLP, Chinese, keyword extraction, abstract extraction',
install_requires=['jieba >= 0.35', 'numpy >= 1.7.1', 'networkx >= 1.9.1'],
packages=['textrank4zh'],
package_dir={'textrank4zh':'textrank4zh'},
package_data={'textrank4zh':['*.txt',]},
)
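
The setup.py above registers the textrank4zh package, its dependencies (jieba, numpy, networkx), and the bundled *.txt data files. As a quick post-install sanity check, a minimal sketch (assuming the package has been installed, e.g. with python setup.py install) is simply to import the classes that the new test files below exercise:

# -*- coding: utf-8 -*-
# Smoke test (sketch): confirm the installed package exposes the public
# classes used by the tests added in this commit.
from textrank4zh import TextRank4Keyword, TextRank4Sentence

print(TextRank4Keyword)
print(TextRank4Sentence)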
6 changes: 5 additions & 1 deletion test.py
@@ -31,4 +31,8 @@
tr4s.train(text=text, speech_tag_filter=True, lower=True, source = 'all_filters')

print '摘要:'
print '\n'.join(tr4s.get_key_sentences(num=3)) # the three sentences with the highest importance
print '\n'.join(tr4s.get_key_sentences(num=3)) # the three sentences with the highest importance

import os
print __file__
print os.path.dirname(os.path.realpath(__file__))
37 changes: 37 additions & 0 deletions test/Segmentation_test.py
@@ -0,0 +1,37 @@
#-*- encoding:utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import sys
import codecs
from textrank4zh import Segmentation

seg = Segmentation.Segmentation()

# text = codecs.open('../text/01.txt', 'r', 'utf-8', 'ignore').read()
text = "视频里,我们的杰宝热情地用英文和全场观众打招呼并清唱了一段《Heal The World》。我们的世界充满了未知数。"

result = seg.segment(text=text, lower=True)

for key in result:
    print(key)

print(20*'#')
for s in result['sentences']:
    print(s)

print(20*'*')
for s in result.sentences:
    print(s)

print()
for ss in result.words_no_filter:
    print(' '.join(ss))

print()
for ss in result.words_no_stop_words:
    print(' / '.join(ss))

print()
for ss in result.words_all_filters:
    print(' | '.join(ss))
20 changes: 20 additions & 0 deletions test/TextRank4Keyword_test.py
@@ -0,0 +1,20 @@
#-*- encoding:utf-8 -*-
from __future__ import print_function

import sys
import codecs
from textrank4zh import TextRank4Keyword

# text = codecs.open('../text/02.txt', 'r', 'utf-8').read()
text = "世界的美好。世界美国英国。 世界和平。"

tr4w = TextRank4Keyword()
tr4w.analyze(text=text, lower=True, window=3, pagerank_config={'alpha': 0.85})

for item in tr4w.get_keywords(30, word_min_len=2):
    print(item.word, item.weight)

print('--phrase--')

for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=0):
    print(phrase)
17 changes: 17 additions & 0 deletions test/TextRank4Sentence_test.py
@@ -0,0 +1,17 @@
#-*- encoding:utf-8 -*-
from __future__ import print_function

import sys
import codecs
from textrank4zh import TextRank4Sentence

# text = codecs.open('./doc/03.txt', 'r', 'utf-8').read()
text = "这间酒店位于北京东三环,里面摆放很多雕塑,文艺气息十足。答谢宴于晚上8点开始。"
tr4s = TextRank4Sentence()
tr4s.analyze(text=text, lower=True, source='all_filters')

print('\n'.join(tr4s.sentences))

print(20*'*')
for item in tr4s.get_key_sentences(num=4):
    print(item.weight, item.sentence)
7 changes: 7 additions & 0 deletions test/codecs_test.py
@@ -0,0 +1,7 @@
#-*- encoding:utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import codecs
text = codecs.open('./doc/01.txt', 'r', 'utf-8', 'ignore').read()
print(type(text))  # unicode in Python 2, str in Python 3
5 files renamed without changes.
10 changes: 10 additions & 0 deletions test/jieba_test.py
@@ -0,0 +1,10 @@
#-*- encoding:utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import jieba.posseg as pseg
words = pseg.cut("我爱北京天安门.。;‘你的#")
for w in words:
    print('{0} {1}'.format(w.word, w.flag))
    print(type(w.word))  # unicode in Python 2, str in Python 3

25 changes: 25 additions & 0 deletions test/util_test.py
@@ -0,0 +1,25 @@
#-*- encoding:utf-8 -*-
from __future__ import (absolute_import, division, print_function,
unicode_literals)

import sys

from textrank4zh import util

def testAttrDict():
    r = util.AttrDict(a=2)
    print(r)
    print(r.a)
    print(r['a'])

def testCombine():
    print(20*'*')
    for item in util.combine(['a', 'b', 'c', 'd'], 2):
        print(item)
    print()
    for item in util.combine(['a', 'b', 'c', 'd'], 3):
        print(item)

if __name__ == "__main__":
    testAttrDict()
    testCombine()
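
For reference, the AttrDict exercised above is the usual dict-with-attribute-access helper: r.a and r['a'] return the same value, as the test prints show. A minimal sketch of such a class, given only for illustration and not necessarily the project's exact implementation in util.py:

class AttrDict(dict):
    """A dict whose keys can also be read and written as attributes (sketch)."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value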