
Commit f3284c1

Integrated the new algorithm into the stream class; the context now determines which implementation is used. For some reason, the C version that is slow for big files does remarkably well on small files: there it beats the respective C implementation of the normal delta apply.
1 parent 95d8202 commit f3284c1

2 files changed: 33 insertions(+), 23 deletions(-)


stream.py (+22 −1)

@@ -22,8 +22,10 @@
     zlib
     )
 
+has_perf_mod = False
 try:
     from _perf import apply_delta as c_apply_delta
+    has_perf_mod = True
 except ImportError:
     pass
 
@@ -330,7 +332,7 @@ def __init__(self, stream_list):
         self._dstreams = tuple(stream_list[:-1])
         self._br = 0
 
-    def _set_cache_too_slow(self, attr):
+    def _set_cache_too_slow_without_c(self, attr):
         # the direct algorithm is fastest and most direct if there is only one
         # delta. Also, the extra overhead might not be worth it for items smaller
         # than X - definitely the case in python, every function call costs
@@ -366,6 +368,15 @@ def _set_cache_too_slow(self, attr):
         self._mm_target.seek(0)
 
     def _set_cache_(self, attr):
+        """Determine which version to use depending on the configuration of the deltas
+        :note: we are only called if we have the performance module"""
+        # otherwise it depends on the amount of memory to shift around
+        if len(self._dstreams) > 1 and self._bstream.size < 150000:
+            return self._set_cache_too_slow_without_c(attr)
+        else:
+            return self._set_cache_brute_(attr)
+
+    def _set_cache_brute_(self, attr):
         """If we are here, we apply the actual deltas"""
 
         buffer_info_list = list()
@@ -438,6 +449,13 @@ def _set_cache_(self, attr):
         self._mm_target = bbuf
         self._size = final_target_size
 
+
+    #{ Configuration
+    if not has_perf_mod:
+        _set_cache_ = _set_cache_brute_
+
+    #} END configuration
+
     def read(self, count=0):
         bl = self._size - self._br      # bytes left
         if count < 1 or count > bl:
@@ -654,4 +672,7 @@ def close(self):
     def write(self, data):
         return len(data)
 
+
 #} END W streams
+
+

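Read together, the stream.py hunks amount to a small strategy-selection pattern: probe for the optional C extension at import time, dispatch per call when it is present, and alias the dispatcher away at class-definition time when it is not. Below is a minimal, self-contained sketch of that pattern, assuming a DeltaApplyReader-style class with _bstream (base stream) and _dstreams (delta streams) attributes; the class name and the placeholder method bodies are illustrative stand-ins mirroring the diff, not the project's full implementation.

has_perf_mod = False
try:
    # optional C extension, as in the diff above; its absence only disables the fast path
    from _perf import apply_delta as c_apply_delta
    has_perf_mod = True
except ImportError:
    pass


class DeltaApplyReader(object):
    """Illustrative stand-in; only the selection machinery is shown."""

    def _set_cache_too_slow_without_c(self, attr):
        """Streaming variant: slow for big files, surprisingly strong for small ones."""

    def _set_cache_brute_(self, attr):
        """Brute-force variant that materializes full buffers."""

    def _set_cache_(self, attr):
        """Dispatch per call; only reachable when the performance module exists."""
        # more than one delta over a small base favours the streaming variant;
        # otherwise shift the memory around with the brute-force one
        if len(self._dstreams) > 1 and self._bstream.size < 150000:
            return self._set_cache_too_slow_without_c(attr)
        return self._set_cache_brute_(attr)

    #{ Configuration
    if not has_perf_mod:
        # no C extension: skip the dispatcher entirely and always go brute-force
        _set_cache_ = _set_cache_brute_
    #} END configuration

The class-body rebinding works because a class suite is ordinary executable code: when has_perf_mod is False, the name _set_cache_ is simply bound to the brute-force function object before the class is created, so no per-call check is paid when the extension is missing.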
test/performance/test_pack.py (+11 −22)

@@ -25,29 +25,18 @@ def test_pack_random_access(self):
 
         # sha lookup: best-case and worst case access
         pdb_pack_info = pdb._pack_info
-        access_times = list()
-        for rand in range(2):
-            if rand:
-                random.shuffle(sha_list)
-            # END shuffle shas
-            st = time()
-            for sha in sha_list:
-                pdb_pack_info(sha)
-            # END for each sha to look up
-            elapsed = time() - st
-            access_times.append(elapsed)
-
-            # discard cache
-            del(pdb._entities)
-            pdb.entities()
-            print >> sys.stderr, "PDB: looked up %i sha in %i packs (random=%i) in %f s ( %f shas/s )" % (ns, len(pdb.entities()), rand, elapsed, ns / elapsed)
-        # END for each random mode
-        elapsed_order, elapsed_rand = access_times
-
-        # well, its never really sequencial regarding the memory patterns, but it
-        # shows how well the prioriy cache performs
-        print >> sys.stderr, "PDB: sequential access is %f %% faster than random-access" % (100 - ((elapsed_order / elapsed_rand) * 100))
+        # END shuffle shas
+        st = time()
+        for sha in sha_list:
+            pdb_pack_info(sha)
+        # END for each sha to look up
+        elapsed = time() - st
 
+        # discard cache
+        del(pdb._entities)
+        pdb.entities()
+        print >> sys.stderr, "PDB: looked up %i sha in %i packs in %f s ( %f shas/s )" % (ns, len(pdb.entities()), elapsed, ns / elapsed)
+        # END for each random mode
 
         # query info and streams only
         max_items = 10000           # can wait longer when testing memory
