forked from marbl/canu
-
Notifications
You must be signed in to change notification settings - Fork 0
/
generalizedUnaryEncoding.h
145 lines (118 loc) · 4.22 KB
/
generalizedUnaryEncoding.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
/******************************************************************************
*
* This file is part of canu, a software program that assembles whole-genome
* sequencing reads into contigs.
*
* This software is based on:
* 'Celera Assembler' (http://wgs-assembler.sourceforge.net)
* the 'kmer package' (http://kmer.sourceforge.net)
* both originally distributed by Applera Corporation under the GNU General
* Public License, version 2.
*
* Canu branched from Celera Assembler at its revision 4587.
* Canu branched from the kmer project at its revision 1994.
*
* Modifications by:
*
* Brian P. Walenz on 2004-APR-27
* are Copyright 2004 Brian P. Walenz, and
* are subject to the GNU General Public License version 2
*
* Brian P. Walenz from 2007-JAN-02 to 2014-APR-11
* are Copyright 2007,2014 J. Craig Venter Institute, and
* are subject to the GNU General Public License version 2
*
* File 'README.licenses' in the root directory of this distribution contains
* full conditions and disclaimers for each license.
*/
#ifndef GENERALIZED_UNARY_ENCODING_H
#define GENERALIZED_UNARY_ENCODING_H
#include "bitPacking.h"
// Lots and lots of semi-useless debugging information
//#define DEBUG_GENERALIZEDUNARYENCODING
// Generalized unary encodings. Defined by (start, step, stop).
// This implementation uses stop=infinity to encode all possible
// numbers. If you know the highest number possible, you'll get a
// slight decrease in space used ...
// The method:
//
// The mth code word consists of 'm' unary encoded, followed by w =
// start + m * step binary encoded bits. If a == stop, then the
// terminator in the unary code is dropped.
//
// Encoding is tricky. Take the 3,2,9 example:
// m w template # vals #'s
// 0 3 1xxx 8 0- 7
// 1 5 01xxxxx 32 8- 39
// 2 7 001xxxxxxx 128 40-167
// 3 9 000xxxxxxxxx 512 168-679
//
// I don't see a nice way of mapping our number n to the prefix m,
// short of some sort of search. The implementation below is
// probably very slow.
//
// On the bright side, decoding is trivial. Read back the unary
// encoded number, then read that many bits to get the value.
//
static const uint64 _genunary_start = 3;
static const uint64 _genunary_step = 2;
//static const uint64 _genunary_stop = ~uint64ZERO;
inline
void
setGeneralizedUnaryEncodedNumber(uint64 *ptr,
uint64 pos,
uint64 *siz,
uint64 val) {
uint64 m = uint64ZERO;
uint64 w = _genunary_start;
uint64 n = uint64ONE << w;
// Search for the prefix m, given our number 'val'.
// While doing this, we get rid of all the implicitly stored values from 'val'.
//
#ifdef DEBUG_GENERALIZEDUNARYENCODING
fprintf(stderr, " val="uint64FMT" try n="uint64FMT" for m="uint64FMT"\n", val, n, m);
#endif
while (n <= val) {
val -= n;
w += _genunary_step;
n = uint64ONE << w;
m++;
#ifdef DEBUG_GENERALIZEDUNARYENCODING
fprintf(stderr, " val="uint64FMT" try n="uint64FMT" for m="uint64FMT"\n", val, n, m);
#endif
}
#ifdef DEBUG_GENERALIZEDUNARYENCODING
fprintf(stderr, "val="uint64FMT" found m="uint64FMT"\n", val, m);
#endif
// Now just encode the number
// m - the unary encoded prefix
// w - the size of the binary encoded number
setUnaryEncodedNumber(ptr, pos, siz, m);
setDecodedValue(ptr, pos+*siz, w, val);
*siz = m + 1 + w;
}
inline
uint64
getGeneralizedUnaryEncodedNumber(uint64 *ptr,
uint64 pos,
uint64 *siz) {
uint64 val = uint64ZERO;
uint64 m = uint64ZERO;
uint64 w = uint64ZERO;
// Comments in the encoder apply here too.
m = getUnaryEncodedNumber(ptr, pos, siz);
w = _genunary_start + m * _genunary_step;
val = getDecodedValue(ptr, pos + *siz, w);
*siz = m + 1 + w;
#ifdef DEBUG_GENERALIZEDUNARYENCODING
fprintf(stderr, "m="uint64FMT" w="uint64FMT" val="uint64FMT"\n", m, w, val);
#endif
// Add in the implcitly stored pieces of the number
//
while (m--) {
w -= _genunary_step;
val += uint64ONE << w;
}
return(val);
}
#endif // GENERALIZED_UNARY_ENCODING_H