Skip to content

Commit

Permalink
Added OpenZIM (untested, likely wrong)
Browse files Browse the repository at this point in the history
  • Loading branch information
KOLANICH committed Jul 27, 2020
1 parent ad875ff commit 285239b
Showing 1 changed file with 356 additions and 0 deletions.
356 changes: 356 additions & 0 deletions media/open_zim.ksy
Original file line number Diff line number Diff line change
@@ -0,0 +1,356 @@
meta:
id: open_zim
title: "(Open) Zeno IMproved"
application:
- Kiwix
- zimlib
file-extension: zim
xref:
wikidata: Q784695
license: CC-BY-SA-3.0
encoding: utf-8
endian: le

doc: |
A file format to store encyclopaedias of articles written in MediaWiki markup language.
Files for test: https://dumps.wikimedia.org/other/kiwix/zim/wikipedia/
doc-ref:
- https://www.openzim.org/wiki/ZIM_file_format
- https://wiki.openzim.org/wiki/OpenZIM

seq:
- id: signature
-orig-id: magicNumber
contents: [0x5A, 0x49, 0x4D, 0x04]
doc: Magic number to recognise the file format, must be
- id: version
type: version
doc: Version of the ZIM file format
- id: uuid
type: uuid
doc: unique id of this zim file
- id: article_count
type: u4
doc: total number of articles
- id: cluster_count
type: u4
doc: total number of clusters
- id: url_pointer_list_ptr
-orig-id: urlPtrPos
type: u8
doc: position of the directory pointerlist ordered by URL
- id: title_pointer_list_ptr
-orig-id: titlePtrPos
type: u8
doc: position of the directory pointerlist ordered by Title
- id: cluster_pointer_list_ptr
-orig-id: clusterPtrPos
type: u8
doc: position of the cluster pointer list
- id: mime_list_ptr
-orig-id: mimeListPos
type: u8
doc: |
position of the MIME type list (also header size)
The MIME type list always follows directly after the header, so the mimeListPos also defines the end and size of the ZIM file header.
The MIME types in this list are zero terminated strings.
- id: main_page_idx
-orig-id: mainPage
type: u4
doc: main page or 0xffffffff if no main page
- id: layout_page_idx
-orig-id: layoutPage
type: u4
doc: layout page or 0xffffffffff if no layout page
- id: md5_ptr
-orig-id: checksumPos
type: u8
doc: pointer to the md5 checksum of this file without the checksum itself. This points always 16 bytes before the end of the file.
instances:
mime_list:
pos: mime_list_ptr
type: str_list
url_pointer_list:
doc: |
The URL pointer list is a list of 8 byte offsets to the directory entries.
The directory entries are always ordered by URL. Ordering is simply done by comparing the URL strings.
Since directory entries have variable sizes this is needed for random access.
Zimlib caches directory entries and references the cached entries via the URL pointers.
pos: url_pointer_list_ptr
type: directory_entry_ptr
repeat: expr
repeat-expr: article_count
title_pointer_list:
doc: |
The title pointer list is a list of article indices ordered by title. The title pointer list actually points to entries in the URL pointer list. Note that the title pointers are only 4 bytes. They are not offsets in the file but article numbers. To get the offset of an article from the title pointer list, you have to look it up in the URL pointer list.
The indirection from titles via URLs to directory entries has two reasons: the pointer list is only half in size as 4 bytes are enough for each entry accessing directory entries by title also makes use of cached directory entries which are referenced by the URL pointers, as implemented in zimlib.
pos: title_pointer_list_ptr
type: title_index
repeat: expr
repeat-expr: article_count
cluster_pointer_list:
pos: cluster_pointer_list_ptr
type: cluster_ptr
repeat: expr
repeat-expr: cluster_count
main_page:
value: url_pointer_list[main_page_idx]
if: main_page_idx != 0xffffffff
layout_page:
value: url_pointer_list[layout_page_idx]
if: layout_page_idx != 0xffffffff
md5:
pos: md5_ptr
type: md5
minimal_xz_lzma_size:
value: 32
types:
md5:
seq:
- id: data
size: 16
uuid:
seq:
- id: data
size: 16
version:
seq:
- id: major
-orig-id: majorVersion
type: u2
doc: |
Major version of the ZIM file format (5 or 6)
Major version is updated when an incompatible change is integrated in the format (a lib made for a version N will probably not be able to read a version N+1)
There are currently 2 major versions :
The version 5
The version 6 (the same that version 5 + potential extended cluster)
- id: minor
-orig-id: minorVersion
type: u2
doc: |
Minor version of the ZIM file format
Minor version is updated when an compatible change is integrated (a lib made for a minor version n will be able to read a version n+1)
title_index:
seq:
- id: index
type: u8
instances:
entry:
value: _parent.url_pointer_list[index].entry
str_list:
seq:
- id: items
type: strz
repeat: until
repeat-until: '_ == ""'
directory_entry_ptr:
seq:
- id: ptr
type: u8
instances:
entry:
pos: ptr
type: directory_entry
types:
directory_entry:
seq:
- id: mime_idx
-orig-id: mimetype
type: u2
doc: MIME type number as defined in the MIME type list
- id: parameter_size
-orig-id: parameter_len
type: u1
doc: (not used) length of extra paramters
- id: namespace
type: s1
enum: namespace
doc: defines to which namespace this directory entry belongs
- id: revision
type: u4
doc: (optional) identifies a revision of the contents of this directory entry, needed to identify updates or revisions in the original history
- id: body
type:
switch-on: mime_idx == 0xffff
cases:
true: redirect
false: article
- id: url
type: strz
doc: string with the URL as refered in the URL pointer list
- id: title
type: strz
doc: string with an title as refered in the Title pointer list or empty; in case it is empty, the URL is used as title
- id: parameter
size: parameter_size
doc: (not used) extra parameters
instances:
mime:
value: _root.mime_list.items[mime_idx]
if: mime_idx != 0xffff

types:
article:
seq:
- id: cluster_index
-orig-id: cluster_number
type: u4
doc: cluster number in which the data of this directory entry is stored
- id: blob_index
-orig-id: blob_number
type: u4
doc: blob number inside the compressed cluster where the contents are stored
instances:
cluster:
value: _root.cluster_pointer_list[cluster_index].cluster
data:
value: cluster.blobs[blob_index]
redirect:
seq:
- id: redirect_index
-orig-id: redirect_index
type: u4
doc: pointer to the directory entry of the redirect target
enums:
namespace:
0x2d: #'-'
id: layout
doc: eg. the LayoutPage, CSS, favicon.png (48x48), JavaScript and images not related to the articles
0x41: #'A'
id: article
doc: articles - see Article Format
0x42: #'B'
id: article_meta_data
doc: article meta data - see Article Format
0x49: #'I'
id: image_file
doc: images, files - see Image Handling
0x4A: #'J'
id: image_text
doc: images, text - see Image Handling
0x4d: #'M'
id: meta_data
doc: ZIM metadata - see Metadata
0x55: #'U'
id: category_text
doc: categories, text - see Category Handling
0x56: #'V'
id: category_article_list
doc: categories, article list - see Category Handling
0x57: #'W'
id: category_list_per_article
doc: categories per article, category list - see Category Handling
0x58: #'X'
id: fulltext_index
doc: fulltext index - see ZIM Index Format
cluster_ptr:
seq:
- id: ptr
type: u8
instances:
cluster:
pos: ptr
type: cluster(ptr)
types:
cluster:
# BUG: JUNK. Seems to be positioned to a wrong offsets. info.reserved is not 0
params:
- id: ptr
type: u8
seq:
- id: info
type: info_t
- id: blobs_ptrs_compressed
process: kaitai.compress.lzma(2)
size: info.ptr_size * blob_count_plus_one
type: blobs_ptrs(blob_count)
if: info.compression == compression::lzma2
- id: blobs_ptrs_uncompressed
type: blobs_ptrs(blob_count)
if: info.compression == compression::none or info.compression == compression::default
instances:
first_blob_offset:
value: ptr + 1 # info_t.size
blobs:
type: blob(_index)
repeat: expr
repeat-expr: blob_count
first_blob_ptr_compressed:
pos: first_blob_offset
process: kaitai.compress.lzma(2)
size: 8 + _root.minimal_xz_lzma_size # max(8, 4) + minimal_xz_lzma_size
type: blob_ptr_u
if: info.compression == compression::lzma2
first_blob_ptr_uncompressed:
pos: first_blob_offset
type: blob_ptr_u
if: info.compression == compression::none or info.compression == compression::default
first_blob_ptr_u_v:
value: (info.compression == compression::lzma2?first_blob_ptr_compressed:first_blob_ptr_uncompressed).as<blob_ptr_u>.value
blob_count_plus_one:
value: first_blob_ptr_u_v / info.ptr_size
blob_count:
value: blob_count_plus_one - 1
blobs_ptrs:
value: (info.compression == compression::lzma2?blobs_ptrs_compressed:blobs_ptrs_uncompressed).ptrs
types:
blobs_ptrs:
params:
- id: count
type: u8
seq:
- id: ptrs
type: blob_ptr_u
repeat: eos
instances:
info:
value: _parent.info
blob:
params:
- id: idx
type: u8
instances:
data:
pos: _parent.ptr + _parent.blobs_ptrs[idx].as<blob_ptr_u>.value
size: _parent.blobs_ptrs[idx+1].as<blob_ptr_u>.value - _parent.blobs_ptrs[idx].as<blob_ptr_u>.value
info_t:
#seq:
# - id: compression
# type: b4
# enum: compression
# - id: ptr_is_u8
# -orig-id: extended
# type: b1
# - id: reserved
# type: b3
seq:
- id: reserved
type: b3
- id: ptr_is_u8
-orig-id: extended
type: b1
- id: compression
type: b4
enum: compression
instances:
ptr_size:
value: (ptr_is_u8?8:4)
blob_ptr_u:
seq:
- id: value
type:
switch-on: _parent.as<cluster>.info.ptr_is_u8 # bug in KSC, as is mandatory
cases:
true: u8
false: u4
enums:
compression:
0:
id: default
-orig-id: default
doc: no compression
1: none
2: zlib # removed
3: bzip2 # removed
4: lzma2

0 comments on commit 285239b

Please sign in to comment.