forked from letiantian/TextRank4ZH
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
614a91e
commit 7234885
Showing
21 changed files
with
623 additions
and
378 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
build | ||
textrank4zh/__pycache__ | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
### 2014 | ||
|
||
主要功能的实现。 | ||
|
||
### 2015 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# -*- coding: utf-8 -*- | ||
# Packaging script for textrank4zh.
try:
    # setuptools is required for `install_requires` to take effect; plain
    # distutils does not recognise that option and only emits an
    # "Unknown distribution option" warning, so the dependencies listed
    # below would never be installed.
    from setuptools import setup
except ImportError:
    # Fall back to distutils so the package can still be built on
    # interpreters where setuptools is not available.
    from distutils.core import setup

# Long description shown on the package index page.
LONGDOC = """
Please go to https://github.com/someus/TextRank4ZH for more info.
"""

setup(
    name='textrank4zh',
    version='0.2',
    description='Extract keywords and abstract Chinese article',
    long_description=LONGDOC,
    author='Letian Sun',
    author_email='[email protected]',
    url='https://github.com/someus/TextRank4ZH',
    license="MIT",
    classifiers=[
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Natural Language :: Chinese (Simplified)',
        'Natural Language :: Chinese (Traditional)',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Topic :: Text Processing',
        'Topic :: Text Processing :: Linguistic',
    ],
    keywords='NLP,Chinese,Keywords extraction, Abstract extraction',
    # Runtime dependencies (honoured by setuptools only).
    install_requires=['jieba >= 0.35', 'numpy >= 1.7.1', 'networkx >= 1.9.1'],
    packages=['textrank4zh'],
    package_dir={'textrank4zh':'textrank4zh'},
    # Ship the *.txt data files that live inside the package directory.
    package_data={'textrank4zh':['*.txt',]},
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#-*- encoding:utf-8 -*- | ||
from __future__ import (absolute_import, division, print_function, | ||
unicode_literals) | ||
|
||
import sys | ||
import codecs | ||
from textrank4zh import Segmentation | ||
|
||
# Example: segment a short text and print every view of the result.
segmenter = Segmentation.Segmentation()

# text = codecs.open('../text/01.txt', 'r', 'utf-8', 'ignore').read()
text = "视频里,我们的杰宝热情地用英文和全场观众打招呼并清唱了一段《Heal The World》。我们的世界充满了未知数。"

segmented = segmenter.segment(text=text, lower=True)

# Print whatever iterating over the result yields (presumably its keys).
for entry in segmented:
    print(entry)

# The sentences are printed twice: once through key access and once through
# attribute access on the same result object.
print('#' * 20)
for sentence in segmented['sentences']:
    print(sentence)

print('*' * 20)
for sentence in segmented.sentences:
    print(sentence)

# Print each word-list attribute in turn, joining every entry with the
# separator that belongs to that attribute.
for attr_name, separator in (
    ('words_no_filter', ' '),
    ('words_no_stop_words', ' / '),
    ('words_all_filters', ' | '),
):
    for words in getattr(segmented, attr_name):
        print(separator.join(words))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#-*- encoding:utf-8 -*- | ||
from __future__ import print_function | ||
|
||
import sys | ||
import codecs | ||
from textrank4zh import TextRank4Keyword | ||
|
||
# Example: rank keywords and key phrases of a short text.
# text = codecs.open('../text/02.txt', 'r', 'utf-8').read()
text = "世界的美好。世界美国英国。 世界和平。"

pagerank_options = {'alpha': 0.85}

keyword_ranker = TextRank4Keyword()
keyword_ranker.analyze(
    text=text,
    lower=True,
    window=3,
    pagerank_config=pagerank_options,
)

# Print up to 30 keywords (word_min_len=2) together with their weights.
top_keywords = keyword_ranker.get_keywords(30, word_min_len=2)
for keyword in top_keywords:
    print(keyword.word, keyword.weight)

print('--phrase--')

# Print the key phrases requested with keywords_num=20 and min_occur_num=0.
key_phrases = keyword_ranker.get_keyphrases(keywords_num=20, min_occur_num=0)
for key_phrase in key_phrases:
    print(key_phrase)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#-*- encoding:utf-8 -*- | ||
from __future__ import print_function | ||
|
||
import sys | ||
import codecs | ||
from textrank4zh import TextRank4Sentence | ||
|
||
# Example: rank the sentences of a short text.
# The file-based input is kept only as a commented-out alternative: reading
# it and then immediately overwriting `text` wasted the read, left the file
# handle unclosed and crashed the example when ./doc/03.txt was missing.
# text = codecs.open('./doc/03.txt', 'r', 'utf-8').read()
text = "这间酒店位于北京东三环,里面摆放很多雕塑,文艺气息十足。答谢宴于晚上8点开始。"
tr4s = TextRank4Sentence()
tr4s.analyze(text=text, lower=True, source='all_filters')

# Print every sentence held by the analyzer, one per line.
print('\n'.join(tr4s.sentences))

print(20*'*')
# Print the weight and text of each key sentence (num=4 requested).
for item in tr4s.get_key_sentences(num=4):
    print(item.weight, item.sentence)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#-*- encoding:utf-8 -*- | ||
from __future__ import (absolute_import, division, print_function, | ||
unicode_literals) | ||
|
||
import codecs | ||
# Read the sample document as UTF-8, ignoring undecodable bytes. The context
# manager closes the file handle once the content has been read instead of
# leaving it open.
with codecs.open('./doc/01.txt', 'r', 'utf-8', 'ignore') as doc_file:
    text = doc_file.read()
print(type(text))  # in py2 is unicode, py3 is str
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#-*- encoding:utf-8 -*- | ||
from __future__ import (absolute_import, division, print_function, | ||
unicode_literals) | ||
|
||
import jieba.posseg as pseg | ||
# Example: print each token produced by jieba's part-of-speech segmenter
# as "<word> <flag>", followed by the type of the word string.
for token in pseg.cut("我爱北京天安门.。;‘你的#"):
    print('{0} {1}'.format(token.word, token.flag))
    print(type(token.word))  # in py2 is unicode, py3 is str
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#-*- encoding:utf-8 -*- | ||
from __future__ import (absolute_import, division, print_function, | ||
unicode_literals) | ||
|
||
import sys | ||
|
||
from textrank4zh import util | ||
|
||
def testAttrDict():
    """Print an AttrDict, then its 'a' entry via attribute and key access."""
    attr_dict = util.AttrDict(a=2)
    print(attr_dict)
    print(attr_dict.a)
    print(attr_dict['a'])
|
||
def testCombine():
    """Print every item util.combine yields for a 4-letter list with 2, then 3."""
    print('*' * 20)
    for size in (2, 3):
        # A fresh list is passed on every call, exactly as two separate
        # literal lists would be.
        for combination in util.combine(['a', 'b', 'c', 'd'], size):
            print(combination)
|
||
# Run both checks, in order, when the file is executed as a script.
if __name__ == "__main__":
    for check in (testAttrDict, testCombine):
        check()
Oops, something went wrong.