forked from Kitware/VTK
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathvtkTextExtraction.h
120 lines (101 loc) · 4.26 KB
/
vtkTextExtraction.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/*=========================================================================
Program: Visualization Toolkit
Module: vtkTextExtraction.h
Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
All rights reserved.
See Copyright.txt or http://www.kitware.com/Copyright.htm for details.
This software is distributed WITHOUT ANY WARRANTY; without even
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the above copyright notice for more information.
=========================================================================*/
/*-------------------------------------------------------------------------
Copyright 2008 Sandia Corporation.
Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
the U.S. Government retains certain rights in this software.
-------------------------------------------------------------------------*/
// .NAME vtkTextExtraction - Extracts text from documents based on their MIME type.
//
// .SECTION Description
// Given a table containing document ids, URIs, MIME types and document contents,
// extracts plain text from each document, and generates a list of 'tags' that
// delineate ranges of text. The actual work of extracting text and generating tags
// is performed by an ordered list of vtkTextExtractionStrategy objects.
//
// By default, vtkTextExtraction has just a single strategy for extracting plain
// text documents. Callers will almost certainly want to supplement or replace
// the default with their own strategies.
//
// Inputs:
// Input port 0: (required) A vtkTable containing document ids, URIs, MIME types and
// document contents (which could be binary).
//
// Outputs:
// Output port 0: The same table with an additional "text" column that contains the
// text extracted from each document.
// Output port 1: A table of document tags that includes "document", "uri", "begin",
// "end", and "type" columns.
//
// Use SetInputArrayToProcess(0, ...) to specify the input table column that contains
// document ids (must be a vtkIdTypeArray). Default: "document".
//
// Use SetInputArrayToProcess(1, ...) to specify the input table column that contains
// URIs (must be a vtkStringArray). Default: "uri".
//
// Use SetInputArrayToProcess(2, ...) to specify the input table column that contains
// Mime types (must be a vtkStringArray). Default: "mime_type".
//
// Use SetInputArrayToProcess(3, ...) to specify the input table column that contains
// document contents (must be a vtkStringArray). Default: "content".
//
// .SECTION Caveats
// The input document contents array must be a string array, even though the individual
// document contents may be binary data.
//
// .SECTION See Also
// vtkTextExtractionStrategy, vtkPlainTextExtractionStrategy
//
// .SECTION Thanks
// Developed by Timothy M. Shead ([email protected]) at Sandia National Laboratories.
#ifndef __vtkTextExtraction_h
#define __vtkTextExtraction_h
#include <vtkTableAlgorithm.h>
class vtkTextExtractionStrategy;
class VTK_TEXT_ANALYSIS_EXPORT vtkTextExtraction : public vtkTableAlgorithm
{
public:
  // Instantiation, run-time type support, and state printing.
  static vtkTextExtraction* New();
  vtkTypeMacro(vtkTextExtraction, vtkTableAlgorithm);
  void PrintSelf(ostream& os, vtkIndent indent);

  // Description:
  // Remove every strategy from the list of strategies.
  void ClearStrategies();

  // Description:
  // Insert a strategy at the front of the list of strategies. Ownership of the
  // supplied object passes to vtkTextExtraction.
  void PrependStrategy(vtkTextExtractionStrategy* strategy);

  // Description:
  // Add a strategy at the back of the list of strategies. Ownership of the
  // supplied object passes to vtkTextExtraction.
  void AppendStrategy(vtkTextExtractionStrategy* strategy);

  // Description:
  // Set / get the name of the output text array. Default: "text".
  vtkSetStringMacro(OutputArray);
  vtkGetStringMacro(OutputArray);

//BTX
protected:
  vtkTextExtraction();
  ~vtkTextExtraction();

  // Pipeline request handler for this filter; its definition is not part of
  // this header.
  virtual int RequestData(vtkInformation* request,
                          vtkInformationVector** inputVector,
                          vtkInformationVector* outputVector);

private:
  // Copy construction and assignment are declared privately and left
  // undefined, so instances cannot be copied.
  vtkTextExtraction(const vtkTextExtraction&); // Not implemented.
  void operator=(const vtkTextExtraction&);    // Not implemented.

  // Storage for the output text array name exposed through the
  // OutputArray set/get macros above.
  char* OutputArray;

  // Private implementation type, only forward-declared here. The pointer is
  // const, so it has to be bound in the constructor's initializer list.
  class Implementation;
  Implementation* const Internal;
//ETX
};
#endif // __vtkTextExtraction_h