forked from modesty/pdf2json
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdfparser.js
143 lines (113 loc) · 4.13 KB
/
pdfparser.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
var nodeUtil = require("util"),
nodeEvents = require("events"),
_ = require("underscore"),
fs = require('fs'),
PDFJS = require("./lib/pdf.js"),
async = require("async");
/**
 * Logs a message prefixed with the name of the object it is invoked on.
 *
 * Must be called with `this` bound to an object exposing `get_name()`,
 * e.g. `nodeUtil._logN.call(parser, "message")`.
 *
 * @param {string} msg - Text to log; emitted as "<name> - <msg>".
 */
nodeUtil._logN = function logWithClassName(msg) {
    var line = this.get_name() + " - " + msg;
    // util.log is deprecated (DEP0059) and may be absent on newer Node.js
    // releases; fall back to a timestamped console.log so that logging can
    // never abort the caller.
    if (typeof nodeUtil.log === 'function') {
        nodeUtil.log(line);
    } else {
        console.log(new Date().toISOString() + " - " + line);
    }
};
/**
 * Logs the call stack of the code that invoked this function.
 *
 * The first two lines of the captured stack (the "Error" header and this
 * function's own frame) are dropped, so the output starts at the caller.
 * Logs an empty string when the runtime provides no `stack` property.
 */
nodeUtil._backTrace = function logCallStack() {
    // An Error records its stack when constructed; no throw/catch is needed.
    var stack = new Error().stack;
    var msg = stack ? stack.split('\n').slice(2).join('\n') : '';
    // util.log is deprecated (DEP0059) and may be absent on newer Node.js
    // releases; fall back to a timestamped console.log in that case.
    if (typeof nodeUtil.log === 'function') {
        nodeUtil.log(msg);
    } else {
        console.log(new Date().toISOString() + " - " + msg);
    }
};
/**
 * PDFParser reads a PDF file from disk and hands its bytes to a PDFJS
 * instance for parsing.
 *
 * It is an EventEmitter and emits:
 *   - "pdfParser_dataReady" (parser): parsing finished, result in parser.data
 *   - "pdfParser_dataError" (parser): file read failed, error in parser.data
 *
 * Raw file contents are cached in a static buffer shared by all instances,
 * so loading the same path again skips the disk read.
 */
var PDFParser = (function () {
    'use strict';

    // private static
    var _nextId = 1;
    var _name = 'PDFParser';
    // Raw file contents keyed by file path; shared by every instance.
    var _binBuffer = {};
    // Upper bound on the number of files kept in _binBuffer.
    var _maxBinBufferCount = 10;
    // Number of "pdfjs_parseDataReady" events to collect before the result is
    // reported as complete. NOTE(review): presumably PDFJS emits one event per
    // parsed property; confirm against ./lib/pdf.js.
    var _parsePropTotal = 2;

    /**
     * @constructor
     * @param {Object} [context] - Optional owner object; when set, its
     *     destroy() method is called from PDFParser#destroy.
     */
    var cls = function (context) {
        // call constructor for super class
        nodeEvents.EventEmitter.call(this);

        // private
        var _id = _nextId++;

        // public (every instance gets its own copy of these methods, so they
        // need to stay lightweight)
        this.get_id = function() { return _id; };
        this.get_name = function() { return _name + _id; };

        this.context = context;
        this.pdfFilePath = null; // current PDF file to load and parse; null means loading/parsing has not started
        this.data = null; // on read success: parsed PDF content; on failure: the "err" object
        this.PDFJS = new PDFJS();
        this.parsePropCount = 0;

        // Subscribe exactly once per instance. Subscribing on every load would
        // stack duplicate handlers, making each PDFJS event count several
        // times and "pdfParser_dataReady" fire early with partial data.
        this.PDFJS.on("pdfjs_parseDataReady", _.bind(_onPDFJSParseDataReady, this));
    };

    // inherit from event emitter
    nodeUtil.inherits(cls, nodeEvents.EventEmitter);

    // public static
    /**
     * @returns {string} The name the next created instance will receive.
     */
    cls.get_nextId = function () {
        return _name + _nextId;
    };

    // private methods, need to be invoked by [funcName].call(this, ...)

    /**
     * Merges one chunk of parsed output into this.data and emits
     * "pdfParser_dataReady" once enough chunks have arrived.
     *
     * @param {Object} data - Properties to merge into this.data.
     */
    var _onPDFJSParseDataReady = function(data) {
        // No parse in progress, or the parser was destroyed: nothing to merge into.
        if (!this.data) {
            return;
        }

        _.extend(this.data, data);
        this.parsePropCount++;
        if (this.parsePropCount >= _parsePropTotal) {
            this.emit("pdfParser_dataReady", this);
            nodeUtil._logN.call(this, "PDF parsing completed.");
        }
    };

    /**
     * Resets the per-load state and passes the raw file content to PDFJS.
     *
     * @param {*} buffer - Raw file content to parse.
     */
    var startParsingPDF = function(buffer) {
        this.data = {};
        this.parsePropCount = 0;
        this.PDFJS.parsePDFData(buffer);
    };

    /**
     * Serves the current file from the shared cache when possible; otherwise
     * makes room for the entry the pending disk read will add.
     *
     * @returns {boolean} true when parsing was started from cached content,
     *     false when the file still has to be read from disk.
     */
    var processBinaryCache = function() {
        if (_.has(_binBuffer, this.pdfFilePath)) {
            startParsingPDF.call(this, _binBuffer[this.pdfFilePath]);
            return true;
        }

        var allKeys = _.keys(_binBuffer);
        // ">=" keeps the cache at _maxBinBufferCount entries after the upcoming
        // insert; the victim index is always < allKeys.length here.
        if (allKeys.length >= _maxBinBufferCount) {
            var key = allKeys[this.get_id() % _maxBinBufferCount];
            delete _binBuffer[key];
            nodeUtil._logN.call(this, "re-cycled cache for " + key);
        }

        return false;
    };

    /**
     * Completion handler for a queued file read.
     *
     * @param {string} filePath - Path that was actually read; used as the
     *     cache key so a later loadPDF() call cannot mislabel this content.
     * @param {?Error} err - Read error, if any.
     * @param {*} data - File content when the read succeeded.
     */
    var processPDFContent = function(filePath, err, data) {
        nodeUtil._logN.call(this, "Load PDF file status:" + (err ? "Error!" : "Success!") );

        // destroy() was called while the read was pending: PDFJS is gone, so
        // there is nothing to parse with and nobody left to notify.
        if (!this.PDFJS) {
            return;
        }

        if (err) {
            this.data = err;
            this.emit("pdfParser_dataError", this);
            return;
        }

        _binBuffer[filePath] = data;
        startParsingPDF.call(this, data);
    };

    // Shared read queue: at most 250 files are read from disk concurrently.
    var fq = async.queue(function (task, callback) {
        fs.readFile(task.path, callback);
    }, 250);

    // public (every instance shares the same method, but it has no access to
    // private fields defined in the constructor)

    /**
     * Starts loading and parsing a PDF file. Completion is reported through
     * the "pdfParser_dataReady" / "pdfParser_dataError" events.
     *
     * @param {string} pdfFilePath - Path of the PDF file to load.
     */
    cls.prototype.loadPDF = function (pdfFilePath) {
        var self = this;
        self.pdfFilePath = pdfFilePath;

        nodeUtil._logN.call(self, " is about to load PDF file " + pdfFilePath);

        if (processBinaryCache.call(self)) {
            return;
        }

        fq.push({path: pdfFilePath}, function (err, data) {
            processPDFContent.call(self, pdfFilePath, err, data);
        });
    };

    /**
     * Releases listeners, the context object and the PDFJS instance.
     * Safe to call more than once.
     */
    cls.prototype.destroy = function() {
        this.removeAllListeners();

        // context object will be set in Web Service project, but not in command line utility
        if (this.context) {
            this.context.destroy();
            this.context = null;
        }

        this.pdfFilePath = null;
        this.data = null;

        if (this.PDFJS) {
            this.PDFJS.destroy();
            this.PDFJS = null;
        }

        this.parsePropCount = 0;
    };

    return cls;
})();
// Export the PDFParser constructor as this module's sole export.
module.exports = PDFParser;