forked from TheDataShed/xlsxreader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrows.go
392 lines (339 loc) · 8.58 KB
/
rows.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
package xlsxreader
import (
"encoding/xml"
"fmt"
"strconv"
"strings"
)
// rawRow represent the raw XML element for parsing a row of data.
type rawRow struct {
Index int `xml:"r,attr,omitempty"`
RawCells []rawCell `xml:"c"`
}
func (rr *rawRow) unmarshalXML(d *xml.Decoder, start xml.StartElement) error {
for _, attr := range start.Attr {
if attr.Name.Local != "r" {
continue
}
var err error
if rr.Index, err = strconv.Atoi(attr.Value); err != nil {
return fmt.Errorf("unable to parse row index: %w", err)
}
}
for {
tok, err := d.Token()
if err != nil {
return fmt.Errorf("error retrieving xml token: %w", err)
}
var se xml.StartElement
switch el := tok.(type) {
case xml.StartElement:
se = el
case xml.EndElement:
if el == start.End() {
return nil
}
default:
continue
}
if se.Name.Local != "c" {
continue
}
var rc rawCell
if err = rc.unmarshalXML(d, se); err != nil {
return fmt.Errorf("unable to unmarshal cell: %w", err)
}
rr.RawCells = append(rr.RawCells, rc)
}
}
// rawCell represents the raw XML element for parsing a cell.
type rawCell struct {
Reference string `xml:"r,attr"` // E.g. A1
Type string `xml:"t,attr,omitempty"`
Value *string `xml:"v,omitempty"`
Style int `xml:"s,attr"`
InlineString *string `xml:"is>t"`
}
func (rc *rawCell) unmarshalXML(d *xml.Decoder, start xml.StartElement) error {
// unmarshal attributes
for _, attr := range start.Attr {
switch attr.Name.Local {
case "r":
rc.Reference = attr.Value
case "t":
rc.Type = attr.Value
case "s":
var err error
if rc.Style, err = strconv.Atoi(attr.Value); err != nil {
return err
}
}
}
for {
tok, err := d.Token()
if err != nil {
return fmt.Errorf("error retrieving xml token: %w", err)
}
var se xml.StartElement
switch el := tok.(type) {
case xml.StartElement:
se = el
case xml.EndElement:
if el == start.End() {
return nil
}
continue
default:
continue
}
switch se.Name.Local {
case "is":
err = rc.unmarshalInlineString(d, se)
case "v":
var v string
if v, err = getCharData(d); err != nil {
return err
}
rc.Value = &v
default:
continue
}
if err != nil {
return fmt.Errorf("unable to parse cell data: %w", err)
}
}
}
func (rc *rawCell) unmarshalInlineString(d *xml.Decoder, start xml.StartElement) error {
for {
tok, err := d.Token()
if err != nil {
return fmt.Errorf("error retrieving xml token: %w", err)
}
var se xml.StartElement
switch el := tok.(type) {
case xml.StartElement:
se = el
case xml.EndElement:
if el == start.End() {
return nil
}
continue
default:
continue
}
if se.Name.Local != "t" {
continue
}
v, err := getCharData(d)
if err != nil {
return fmt.Errorf("unable to parse string: %w", err)
}
rc.InlineString = &v
return nil
}
}
// Row represents a row of data read from an Xlsx file, in a consumable format
type Row struct {
Error error
Index int
Cells []Cell
}
// Cell represents the data in a single cell as a consumable format.
type Cell struct {
Column string // E.G A, B, C
Row int
Value string
Type CellType
}
// CellType defines the data type of an excel cell
type CellType string
const (
// TypeString is for text cells
TypeString CellType = "string"
// TypeNumerical is for numerical values
TypeNumerical CellType = "numerical"
// TypeDateTime is for date values
TypeDateTime CellType = "datetime"
// TypeBoolean is for true/false values
TypeBoolean CellType = "boolean"
)
// ColumnIndex gives a number, representing the column the cell lies beneath.
func (c Cell) ColumnIndex() int {
return asIndex(c.Column)
}
// getCellValue interrogates a raw cell to get a textual representation of the cell's contents.
// Numerical values are returned in their string format.
// Dates are returned as an ISO YYYY-MM-DD formatted string.
// Datetimes are returned in RFC3339 (ISO-8601) YYYY-MM-DDTHH:MM:SSZ formatted string.
func (x *XlsxFile) getCellValue(r rawCell) (string, error) {
if r.Type == "inlineStr" {
if r.InlineString == nil {
return "", fmt.Errorf("cell had type of InlineString, but the InlineString attribute was missing")
}
return *r.InlineString, nil
}
if r.Value == nil {
return "", fmt.Errorf("unable to get cell value for cell %s - no value element found", r.Reference)
}
if r.Type == "s" {
index, err := strconv.Atoi(*r.Value)
if err != nil {
return "", err
}
if len(x.sharedStrings) <= index {
return "", fmt.Errorf("attempted to index value %d in shared strings of length %d",
index, len(x.sharedStrings))
}
return x.sharedStrings[index], nil
}
if x.dateStyles[r.Style] && r.Type != "d" {
formattedDate, err := convertExcelDateToDateString(*r.Value)
if err != nil {
return "", err
}
return formattedDate, nil
}
return *r.Value, nil
}
func (x *XlsxFile) getCellType(r rawCell) CellType {
if x.dateStyles[r.Style] {
return TypeDateTime
}
switch r.Type {
case "b":
return TypeBoolean
case "d":
return TypeDateTime
case "n", "":
return TypeNumerical
case "s", "inlineStr":
return TypeString
default:
return TypeString
}
}
// readSheetRows iterates over "row" elements within a worksheet,
// pushing a parsed Row struct into a channel for each one.
func (x *XlsxFile) readSheetRows(sheet string, ch chan<- Row) {
defer close(ch)
file, ok := x.sheetFiles[sheet]
if !ok {
ch <- Row{
Error: fmt.Errorf("unable to open sheet %s", sheet),
}
return
}
xmlFile, err := file.Open()
if err != nil {
ch <- Row{
Error: err,
}
return
}
defer xmlFile.Close()
decoder := xml.NewDecoder(xmlFile)
for {
token, _ := decoder.Token()
if token == nil {
return
}
switch startElement := token.(type) {
case xml.StartElement:
if startElement.Name.Local == "row" {
row := x.parseRow(decoder, &startElement)
if len(row.Cells) < 1 && row.Error == nil {
continue
}
ch <- row
}
}
}
}
// parseRow parses the raw XML of a row element into a consumable Row struct.
// The Row struct returned will contain any errors that occurred either in
// interrogating values, or in parsing the XML.
func (x *XlsxFile) parseRow(decoder *xml.Decoder, startElement *xml.StartElement) Row {
var r rawRow
err := r.unmarshalXML(decoder, *startElement)
if err != nil {
return Row{
Error: err,
Index: r.Index,
}
}
cells, err := x.parseRawCells(r.RawCells, r.Index)
if err != nil {
return Row{
Error: err,
Index: r.Index,
}
}
return Row{
Cells: cells,
Index: r.Index,
}
}
// parseRawCells converts a slice of structs containing a raw representation of the XML into
// a standardised slice of Cell structs. An error will be returned if it is not possible
// to interpret the value of any of the cells.
func (x *XlsxFile) parseRawCells(rawCells []rawCell, index int) ([]Cell, error) {
cells := []Cell{}
for _, rawCell := range rawCells {
if rawCell.Value == nil && rawCell.InlineString == nil {
// This cell is empty, so ignore it
continue
}
column := strings.Map(removeNonAlpha, rawCell.Reference)
val, err := x.getCellValue(rawCell)
if err != nil {
return nil, err
}
cells = append(cells, Cell{
Column: column,
Row: index,
Value: val,
Type: x.getCellType(rawCell),
})
}
return cells, nil
}
// ReadRows provides an interface allowing rows from a specific worksheet to be streamed
// from an xlsx file.
// In order to provide a simplistic interface, this method returns a channel that can be
// range-d over.
//
// This method has one notable drawback however - the entire file must be consumed before
// the channel will be closed. Reading only some of the values will leave an orphaned
// goroutine and channel behind.
//
// Notes:
// Xlsx sheets may omit cells which are empty, meaning a row may not have continuous cell
// references. This function makes no attempt to fill/pad the missing cells.
func (x *XlsxFile) ReadRows(sheet string) chan Row {
rowChannel := make(chan Row)
go x.readSheetRows(sheet, rowChannel)
return rowChannel
}
// removeNonAlpha is used in combination with strings.Map to remove any non alpha-numeric
// characters from a cell reference, returning just the column name in a consistent uppercase format.
// For example, a11 -> A, AA1 -> AA
func removeNonAlpha(r rune) rune {
if 'A' <= r && r <= 'Z' {
return r
}
if 'a' <= r && r <= 'z' {
// make it uppercase
return r - 32
}
// drop the rune
return -1
}
// cell name to cell index. 'A' -> 0, 'Z' -> 25, 'AA' -> 26
func asIndex(s string) int {
index := 0
for _, c := range s {
index *= 26
index += int(c) - 'A' + 1
}
return index - 1
}