Skip to content

Commit e83210d

Browse files
committed
initial version of pack writing, which seems to work, but still needs some more testing and verification
1 parent 810d1e3 commit e83210d

File tree

5 files changed

+266
-22
lines changed

5 files changed

+266
-22
lines changed

gitdb/exc.py

+3
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ class BadObject(ODBError):
1717

1818
def __str__(self):
1919
return "BadObject: %s" % to_hex_sha(self.args[0])
20+
21+
class ParseError(ODBError):
    """Thrown if the parsing of a file failed due to an invalid format"""
    # NOTE(review): no additional state - the format error description is
    # expected to be passed as the exception's message by the raiser
2023

2124
class AmbiguousObjectName(ODBError):
2225
"""Thrown if a possibly shortened name does not uniquely represent a single object

gitdb/fun.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848

4949
__all__ = ('is_loose_object', 'loose_object_header_info', 'msb_size', 'pack_object_header_info',
5050
'write_object', 'loose_object_header', 'stream_copy', 'apply_delta_data',
51-
'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList')
51+
'is_equal_canonical_sha', 'connect_deltas', 'DeltaChunkList', 'create_pack_object_header')
5252

5353

5454
#{ Structures
@@ -412,6 +412,24 @@ def pack_object_header_info(data):
412412
s += 7
413413
# END character loop
414414
return (type_id, size, i)
415+
416+
def create_pack_object_header(obj_type, obj_size):
    """Create the variable-length header that precedes an object in a pack.

    The first byte stores the object type in bits 4-6 and the lowest 4 bits
    of the size; every following byte stores 7 more size bits (little-endian
    groups), with the most significant bit set on all bytes except the last
    one to signal continuation.

    :param obj_type: pack type_id of the object
    :param obj_size: uncompressed size in bytes of the following object stream
    :return: string defining the pack header comprised of the object type
        and its uncompressed size in bytes"""
    hdr = str()                             # output string
    c = (obj_type << 4) | (obj_size & 0xf)  # first byte: type + low size nibble
    obj_size >>= 4
    while obj_size:
        # more size bits follow - flag continuation on the previous byte
        hdr += chr(c | 0x80)
        c = obj_size & 0x7f
        obj_size >>= 7
    # END until size is consumed
    hdr += chr(c)
    return hdr
415433

416434
def msb_size(data, offset=0):
417435
"""

gitdb/pack.py

+189-7
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
"""Contains PackIndexFile and PackFile implementations"""
66
from gitdb.exc import (
77
BadObject,
8-
UnsupportedOperation
8+
UnsupportedOperation,
9+
ParseError
910
)
1011
from util import (
1112
zlib,
@@ -15,6 +16,7 @@
1516
)
1617

1718
from fun import (
19+
create_pack_object_header,
1820
pack_object_header_info,
1921
is_equal_canonical_sha,
2022
type_id_to_type_map,
@@ -47,13 +49,16 @@
4749
DeltaApplyReader,
4850
Sha1Writer,
4951
NullStream,
52+
FlexibleSha1Writer
5053
)
5154

5255
from struct import (
5356
pack,
5457
unpack,
5558
)
5659

60+
from binascii import crc32
61+
5762
from itertools import izip
5863
import array
5964
import os
@@ -119,10 +124,113 @@ def pack_object_at(data, offset, as_stream):
119124
return abs_data_offset, ODeltaPackInfo(offset, type_id, uncomp_size, delta_info)
120125
# END handle info
121126
# END handle stream
122-
127+
128+
def write_stream_to_pack(read, write, zstream, want_crc=False, chunk_size=None):
    """Copy a stream as read from the read function, compress it using the
    given zlib compression object, and write the result using the write function.
    Count the number of written bytes and return it.

    :param read: callable returning up to chunk_size bytes per call
    :param write: callable receiving the compressed strings; its return
        value cannot be relied upon and is ignored
    :param zstream: zlib compression object used to compress the copied data
    :param want_crc: if True, the crc will be generated over the compressed data
    :param chunk_size: amount of bytes to request per read call; defaults to
        the module-level chunk_size for backward compatibility
    :return: tuple(no bytes read, no bytes written, crc32) crc might be 0 if want_crc
        was false"""
    if chunk_size is None:
        # the parameter shadows the module-wide default, hence the explicit lookup
        chunk_size = globals()['chunk_size']
    # END handle default chunk size
    br = 0          # bytes read
    bw = 0          # bytes written
    crc = 0

    while True:
        chunk = read(chunk_size)
        br += len(chunk)
        compressed = zstream.compress(chunk)
        bw += len(compressed)
        write(compressed)                   # cannot assume return value

        if want_crc:
            crc = crc32(compressed, crc)
        # END handle crc

        # a short read marks the end of the stream
        if len(chunk) != chunk_size:
            break
    # END copy loop

    # flush whatever the compressor still buffers
    compressed = zstream.flush()
    bw += len(compressed)
    write(compressed)
    if want_crc:
        crc = crc32(compressed, crc)
    # END handle crc

    return (br, bw, crc)
161+
162+
123163
#} END utilities
124164

125165

166+
class IndexWriter(object):
    """Utility to cache index information, allowing to write all information later
    in one go to the given stream
    :note: currently only writes v2 indices"""
    __slots__ = '_objs'

    def __init__(self):
        # list of (binsha, crc, offset) tuples, sorted lazily on write
        self._objs = list()

    def append(self, binsha, crc, offset):
        """Append one piece of object information

        :param binsha: 20 byte binary sha1 of the object
        :param crc: crc32 over the object's compressed data in the pack
        :param offset: offset at which the object's header starts in the pack"""
        self._objs.append((binsha, crc, offset))

    def write(self, pack_binsha, write):
        """Write the index file using the given write method

        :param pack_binsha: sha over the whole pack that we index
        :param write: function receiving the serialized index data as strings"""
        # index entries must be ordered by binary sha
        self._objs.sort(key=lambda o: o[0])

        # everything up to the trailing pack sha is covered by the index' own sha
        sha_writer = FlexibleSha1Writer(write)
        sha_write = sha_writer.write
        sha_write(PackIndexFile.index_v2_signature)
        sha_write(pack(">L", PackIndexFile.index_version_default))

        # fanout
        tmplist = list((0,)*256)            # fanout or list with 64 bit offsets
        for t in self._objs:
            tmplist[ord(t[0][0])] += 1
        # END prepare fanout

        # convert per-first-byte counts into cumulative counts: entry i must
        # hold the number of objects whose sha starts with a byte <= i
        for i in xrange(255):
            v = tmplist[i]
            sha_write(pack('>L', v))
            # BUGFIX: accumulate into the next slot - assigning (=) clobbered
            # the next byte's count and produced a corrupt fanout table
            tmplist[i+1] += v
        # END write each fanout entry
        sha_write(pack('>L', tmplist[255]))

        # sha1 ordered
        # save calls, that is push them into c
        sha_write(''.join(t[0] for t in self._objs))

        # crc32 - masked to keep the packed value unsigned
        for t in self._objs:
            sha_write(pack('>L', t[1]&0xffffffff))
        # END for each crc

        tmplist = list()
        # offset 32
        for t in self._objs:
            ofs = t[2]
            if ofs > 0x7fffffff:
                # doesn't fit 31 bits - store the real offset in the 64 bit
                # table and write its index with the msb set instead
                tmplist.append(ofs)
                ofs = 0x80000000 + len(tmplist)-1
            # END handle 64 bit offsets
            sha_write(pack('>L', ofs&0xffffffff))
        # END for each offset

        # offset 64
        for ofs in tmplist:
            sha_write(pack(">Q", ofs))
        # END for each offset

        # trailer
        assert(len(pack_binsha) == 20)
        sha_write(pack_binsha)
        # the index' own sha is written with the raw write function - it must
        # not feed back into the sha computation
        write(sha_writer.sha(as_hex=False))
232+
233+
126234

127235
class PackIndexFile(LazyMixin):
128236
"""A pack index provides offsets into the corresponding pack, allowing to find
@@ -135,6 +243,8 @@ class PackIndexFile(LazyMixin):
135243

136244
# used in v2 indices
137245
_sha_list_offset = 8 + 1024
246+
index_v2_signature = '\377tOc'
247+
index_version_default = 2
138248

139249
def __init__(self, indexpath):
140250
super(PackIndexFile, self).__init__()
@@ -155,7 +265,7 @@ def _set_cache_(self, attr):
155265
# to access the fanout table or related properties
156266

157267
# CHECK VERSION
158-
self._version = (self._data[:4] == '\377tOc' and 2) or 1
268+
self._version = (self._data[:4] == self.index_v2_signature and 2) or 1
159269
if self._version == 2:
160270
version_id = unpack_from(">L", self._data, 4)[0]
161271
assert version_id == self._version, "Unsupported index version: %i" % version_id
@@ -383,6 +493,8 @@ class PackFile(LazyMixin):
383493
case"""
384494

385495
__slots__ = ('_packpath', '_data', '_size', '_version')
496+
pack_signature = 0x5041434b # 'PACK'
497+
pack_version_default = 2
386498

387499
# offset into our data at which the first object starts
388500
first_object_offset = 3*4 # header bytes
@@ -396,15 +508,19 @@ def _set_cache_(self, attr):
396508
self._data = file_contents_ro_filepath(self._packpath)
397509

398510
# read the header information
399-
type_id, self._version, self._size = unpack_from(">4sLL", self._data, 0)
511+
type_id, self._version, self._size = unpack_from(">LLL", self._data, 0)
400512

401513
# TODO: figure out whether we should better keep the lock, or maybe
402514
# add a .keep file instead ?
403515
else: # must be '_size' or '_version'
404516
# read header info - we do that just with a file stream
405-
type_id, self._version, self._size = unpack(">4sLL", open(self._packpath).read(12))
517+
type_id, self._version, self._size = unpack(">LLL", open(self._packpath).read(12))
406518
# END handle header
407519

520+
if type_id != self.pack_signature:
521+
raise ParseError("Invalid pack signature: %i" % type_id)
522+
#END assert type id
523+
408524
def _iter_objects(self, start_offset, as_stream=True):
409525
"""Handle the actual iteration of objects within this pack"""
410526
data = self._data
@@ -759,7 +875,8 @@ def collect_streams(self, sha):
759875

760876

761877
@classmethod
762-
def create(cls, object_iter, pack_write, index_write=None):
878+
def write_pack(cls, object_iter, pack_write, index_write=None,
879+
object_count = None, zlib_compression = zlib.Z_BEST_SPEED):
763880
"""
764881
Create a new pack by putting all objects obtained by the object_iterator
765882
into a pack which is written using the pack_write method.
@@ -769,9 +886,74 @@ def create(cls, object_iter, pack_write, index_write=None):
769886
:param pack_write: function to receive strings to write into the pack stream
770887
:param indx_write: if not None, the function writes the index file corresponding
771888
to the pack.
889+
:param object_count: if you can provide the amount of objects in your iteration,
890+
this would be the place to put it. Otherwise we have to pre-iterate and store
891+
all items into a list to get the number, which uses more memory than necessary.
892+
:param zlib_compression: the zlib compression level to use
893+
:return: binary sha over all the contents of the pack
772894
:note: The destination of the write functions is up to the user. It could
773-
be a socket, or a file for instance"""
895+
be a socket, or a file for instance
896+
:note: writes only undeltified objects"""
897+
objs = object_iter
898+
if not object_count:
899+
if not isinstance(object_iter, (tuple, list)):
900+
objs = list(object_iter)
901+
#END handle list type
902+
object_count = len(objs)
903+
#END handle object
904+
905+
pack_writer = FlexibleSha1Writer(pack_write)
906+
pwrite = pack_writer.write
907+
ofs = 0 # current offset into the pack file
908+
index = None
909+
wants_index = index_write is not None
910+
911+
# write header
912+
pwrite(pack('>LLL', PackFile.pack_signature, PackFile.pack_version_default, object_count))
913+
ofs += 12
914+
915+
if wants_index:
916+
index = IndexWriter()
917+
#END handle index header
918+
919+
actual_count = 0
920+
for obj in objs:
921+
actual_count += 1
922+
923+
# object header
924+
hdr = create_pack_object_header(obj.type_id, obj.size)
925+
pwrite(hdr)
926+
927+
# data stream
928+
zstream = zlib.compressobj(zlib_compression)
929+
ostream = obj.stream
930+
br, bw, crc = write_stream_to_pack(ostream.read, pwrite, zstream, want_crc = index_write)
931+
assert(br == obj.size)
932+
if wants_index:
933+
index.append(obj.binsha, crc, ofs)
934+
#END handle index
935+
936+
ofs += len(hdr) + bw
937+
if actual_count == object_count:
938+
break
939+
#END abort once we are done
940+
#END for each object
941+
942+
if actual_count != object_count:
943+
raise ValueError("Expected to write %i objects into pack, but received only %i from iterators" % (object_count, actual_count))
944+
#END count assertion
945+
946+
# write footer
947+
binsha = pack_writer.sha(as_hex = False)
948+
assert len(binsha) == 20
949+
pack_write(binsha)
950+
ofs += len(binsha) # just for completeness ;)
951+
952+
if wants_index:
953+
index.write(binsha, index_write)
954+
#END handle index
774955

956+
return binsha
775957

776958

777959
#} END interface

gitdb/stream.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,9 @@
3333
except ImportError:
3434
pass
3535

36-
__all__ = ('DecompressMemMapReader', 'FDCompressedSha1Writer', 'DeltaApplyReader')
36+
__all__ = ( 'DecompressMemMapReader', 'FDCompressedSha1Writer', 'DeltaApplyReader',
37+
'Sha1Writer', 'FlexibleSha1Writer', 'ZippedStoreShaWriter', 'FDCompressedSha1Writer',
38+
'FDStream', 'NullStream')
3739

3840

3941
#{ RO Streams
@@ -557,6 +559,20 @@ def sha(self, as_hex = False):
557559
#} END interface
558560

559561

562+
class FlexibleSha1Writer(Sha1Writer):
    """Writer producing a sha1 while passing on the written bytes to the given
    write function"""
    # single slot: the user-provided write callable we forward to
    __slots__ = 'writer'

    def __init__(self, writer):
        """Initialize with a callable that receives every string written to us"""
        Sha1Writer.__init__(self)
        self.writer = writer

    def write(self, data):
        """Update our sha with data, then forward data to the write callable"""
        # sha update first, then forwarding - keeps the sha consistent even
        # if the write callable raises
        Sha1Writer.write(self, data)
        self.writer(data)
574+
575+
560576
class ZippedStoreShaWriter(Sha1Writer):
561577
"""Remembers everything someone writes to it and generates a sha"""
562578
__slots__ = ('buf', 'zip')

0 commit comments

Comments
 (0)