
Commit f3284c1

Integrated the new algorithm into the stream class; the context now determines which implementation is used. For some reason, the C version that is slow for big files does remarkably well on small files: there it beats the respective C implementation of the normal delta apply.
1 parent 95d8202 commit f3284c1

2 files changed: 33 insertions(+), 23 deletions(-)


stream.py (+22 −1)

@@ -22,8 +22,10 @@
     zlib
     )
 
+has_perf_mod = False
 try:
     from _perf import apply_delta as c_apply_delta
+    has_perf_mod = True
 except ImportError:
     pass
 
@@ -330,7 +332,7 @@ def __init__(self, stream_list):
         self._dstreams = tuple(stream_list[:-1])
         self._br = 0
 
-    def _set_cache_too_slow(self, attr):
+    def _set_cache_too_slow_without_c(self, attr):
         # the direct algorithm is fastest and most direct if there is only one
         # delta. Also, the extra overhead might not be worth it for items smaller
         # than X - definitely the case in python, every function call costs
@@ -366,6 +368,15 @@ def _set_cache_too_slow(self, attr):
         self._mm_target.seek(0)
 
     def _set_cache_(self, attr):
+        """Determine which version to use depending on the configuration of the deltas
+        :note: we are only called if we have the performance module"""
+        # otherwise it depends on the amount of memory to shift around
+        if len(self._dstreams) > 1 and self._bstream.size < 150000:
+            return self._set_cache_too_slow_without_c(attr)
+        else:
+            return self._set_cache_brute_(attr)
+
+    def _set_cache_brute_(self, attr):
         """If we are here, we apply the actual deltas"""
 
         buffer_info_list = list()
@@ -438,6 +449,13 @@ def _set_cache_(self, attr):
         self._mm_target = bbuf
         self._size = final_target_size
 
+
+    #{ Configuration
+    if not has_perf_mod:
+        _set_cache_ = _set_cache_brute_
+
+    #} END configuration
+
     def read(self, count=0):
         bl = self._size - self._br      # bytes left
         if count < 1 or count > bl:
@@ -654,4 +672,7 @@ def close(self):
     def write(self, data):
         return len(data)
 
+
 #} END W streams
+
+

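Read together, the stream.py hunks amount to a small strategy-selection pattern: probe for the optional C extension at import time, dispatch per call when it is present, and alias the dispatcher away at class-definition time when it is not. Below is a minimal, self-contained sketch of that pattern, assuming a DeltaApplyReader-style class with _bstream (base stream) and _dstreams (delta streams) attributes; the class name and the placeholder method bodies are illustrative stand-ins mirroring the diff, not the project's full implementation.

has_perf_mod = False
try:
    # optional C extension, as in the diff above; its absence only disables the fast path
    from _perf import apply_delta as c_apply_delta
    has_perf_mod = True
except ImportError:
    pass


class DeltaApplyReader(object):
    """Illustrative stand-in; only the selection machinery is shown."""

    def _set_cache_too_slow_without_c(self, attr):
        """Streaming variant: slow for big files, surprisingly strong for small ones."""

    def _set_cache_brute_(self, attr):
        """Brute-force variant that materializes full buffers."""

    def _set_cache_(self, attr):
        """Dispatch per call; only reachable when the performance module exists."""
        # more than one delta over a small base favours the streaming variant;
        # otherwise shift the memory around with the brute-force one
        if len(self._dstreams) > 1 and self._bstream.size < 150000:
            return self._set_cache_too_slow_without_c(attr)
        return self._set_cache_brute_(attr)

    #{ Configuration
    if not has_perf_mod:
        # no C extension: skip the dispatcher entirely and always go brute-force
        _set_cache_ = _set_cache_brute_
    #} END configuration

The class-body rebinding works because a class suite is ordinary executable code: when has_perf_mod is False, the name _set_cache_ is simply bound to the brute-force function object before the class is created, so no per-call check is paid when the extension is missing.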
test/performance/test_pack.py (+11 −22)

@@ -25,29 +25,18 @@ def test_pack_random_access(self):
 
         # sha lookup: best-case and worst case access
         pdb_pack_info = pdb._pack_info
-        access_times = list()
-        for rand in range(2):
-            if rand:
-                random.shuffle(sha_list)
-            # END shuffle shas
-            st = time()
-            for sha in sha_list:
-                pdb_pack_info(sha)
-            # END for each sha to look up
-            elapsed = time() - st
-            access_times.append(elapsed)
-
-            # discard cache
-            del(pdb._entities)
-            pdb.entities()
-            print >> sys.stderr, "PDB: looked up %i sha in %i packs (random=%i) in %f s ( %f shas/s )" % (ns, len(pdb.entities()), rand, elapsed, ns / elapsed)
-        # END for each random mode
-        elapsed_order, elapsed_rand = access_times
-
-        # well, its never really sequencial regarding the memory patterns, but it
-        # shows how well the prioriy cache performs
-        print >> sys.stderr, "PDB: sequential access is %f %% faster than random-access" % (100 - ((elapsed_order / elapsed_rand) * 100))
+        # END shuffle shas
+        st = time()
+        for sha in sha_list:
+            pdb_pack_info(sha)
+        # END for each sha to look up
+        elapsed = time() - st
 
+        # discard cache
+        del(pdb._entities)
+        pdb.entities()
+        print >> sys.stderr, "PDB: looked up %i sha in %i packs in %f s ( %f shas/s )" % (ns, len(pdb.entities()), elapsed, ns / elapsed)
+        # END for each random mode
 
         # query info and streams only
         max_items = 10000           # can wait longer when testing memory
