|
| 1 | +#!/usr/bin/env python |
| 2 | +# -*- coding:UTF-8 |
| 3 | +__author__ = 'shenshijun' |
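"""
Rabin-Karp string matching.

First compute the hash of the pattern string. Then, starting at the beginning of the target string, compute the hash of a substring of the same length. If the two hashes are equal the window is a candidate match (confirmed by comparing the characters directly); otherwise shift one position to the right and compute the new hash. The overall procedure closely resembles brute-force string matching,
but because each new hash can reuse the previous one, it only needs to add the contribution of the incoming character and remove the contribution of the previous window's first character.
Rabin-Karp preprocessing takes O(m) time; the worst-case matching time is O((n-m+1)m), and the expected complexity is O(m+n).
"""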
| 9 | + |
| 10 | + |
def match(origin, pattern):
    """Return the index of the first occurrence of ``pattern`` in ``origin``.

    Uses the Rabin-Karp algorithm: a rolling polynomial hash of each
    ``len(pattern)``-sized window of ``origin`` is compared with the hash of
    ``pattern``; on a hash hit the window is verified element by element so
    that hash collisions never produce a false match.

    Args:
        origin: The text to search in (a string).
        pattern: The text to search for (a string).

    Returns:
        The zero-based index of the first match, ``0`` for an empty
        ``pattern``, or ``None`` when ``pattern`` does not occur in
        ``origin`` (including when it is longer than ``origin``).
    """
    origin_len = len(origin)
    pattern_len = len(pattern)

    # An empty pattern matches at the very beginning, like str.find does.
    if pattern_len == 0:
        return 0
    # A pattern longer than the text can never match; bail out before
    # hashing so that no out-of-range index is touched.
    if pattern_len > origin_len:
        return None

    base = 256
    # Keeping hashes reduced modulo a prime bounds their size, so every
    # rolling update costs O(1) regardless of the pattern length.
    modulus = 1000000007
    # Weight of the left-most character of a window: base ** (m - 1) mod p.
    high_order = pow(base, pattern_len - 1, modulus)

    def _hash(chars):
        # Horner evaluation of the polynomial hash over ``chars``.
        hash_code = 0
        for char in chars:
            hash_code = (hash_code * base + ord(char)) % modulus
        return hash_code

    def _window_equals(start):
        # Direct comparison that confirms a hash hit is a real match.
        return all(origin[start + offset] == pattern[offset]
                   for offset in range(pattern_len))

    pattern_hash = _hash(pattern)
    window_hash = _hash(origin[:pattern_len])
    last_start = origin_len - pattern_len  # last valid window position

    start = 0
    while True:
        if window_hash == pattern_hash and _window_equals(start):
            return start
        if start == last_start:
            # Every window, including the final one, has been examined.
            return None
        # Slide the window: drop origin[start], append the next character.
        outgoing = ord(origin[start]) * high_order
        incoming = ord(origin[start + pattern_len])
        window_hash = ((window_hash - outgoing) * base + incoming) % modulus
        start += 1
| 39 | + |
| 40 | + |
def main():
    """Demo entry point: print the result of searching a sample string for 'sb'."""
    # A parenthesised single-argument print behaves identically on
    # Python 2 and Python 3, unlike the bare print statement.
    print(match("sbsfsdgdgfgasbssssfsfdfeferf", 'sb'))
| 43 | + |
| 44 | + |
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
0 commit comments