-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathfeature_info.i
373 lines (273 loc) · 11.6 KB
/
feature_info.i
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
/* feature_info.i -*- C++ -*-
Jeremy Barnes, 24 September 2009
Copyright (c) 2009 Jeremy Barnes. All rights reserved.
SWIG wrapper for the Feature_Info and Categorical_Info classes.
*/
// Module preamble: names the generated module, pulls the wrapped C++ header
// into the generated wrapper code, and loads the SWIG library files needed by
// the declarations in this interface.
%module jml
%{
#include "jml/boosting/feature_info.h"
%}
%include "std_string.i"
// std_vector.i supplies the std::vector template definition and typemaps that
// the %template instantiation below relies on.  SWIG processes a given
// %include only once, so this is harmless if an enclosing interface has
// already loaded it.
%include "std_vector.i"
%include "boost_shared_ptr.i"
// Expose std::vector<std::string> to the target language under the name
// "svector".
%template(svector) std::vector<std::string>;
namespace ML {
// Forward declarations.  In the visible part of this file, Training_Data and
// Mutable_Feature_Space appear only as reference parameters of free
// functions, while Categorical_Info and Mutable_Categorical_Info are fully
// declared further down.  Feature_Space is not referenced by any declaration
// visible here.
class Feature_Space;
class Mutable_Feature_Space;
class Training_Data;
class Categorical_Info;
class Mutable_Categorical_Info;
/*****************************************************************************/
/* CATEGORICAL_INFO */
/*****************************************************************************/
/** Abstract interface describing how the values of a categorical feature
    are encoded to, and decoded from, integer category codes.  It is kept
    separate from the main Feature_Info structure.
*/
struct Categorical_Info {
    virtual ~Categorical_Info() {}

    /** Render the whole set of categories as a string. */
    virtual std::string print() const = 0;

    /** Render the single category identified by value. */
    virtual std::string print(int value) const = 0;

    /** Look a category up by name; yields -1 when it is not found. */
    virtual int lookup(const std::string & value) const = 0;

    /** Like lookup(), except that a name which is not found raises an
        exception.  The default implementation is built on lookup(). */
    virtual unsigned parse(const std::string & value) const;

    /** Number of possible categories. */
    virtual unsigned count() const = 0;

    /** Write this object to a store. */
    virtual void serialize(DB::Store_Writer & store) const = 0;

    /** Rebuild this object from a store. */
    virtual void reconstitute(DB::Store_Reader & store) = 0;

    /** Name of this class, used for serialization. */
    virtual std::string class_id() const = 0;

    /** Freeze the object so that it can no longer grow. */
    virtual void freeze() = 0;

    /** Serialize info in a form that permits polymorphic reconstitution. */
    static void poly_serialize(DB::Store_Writer & store,
                               const Categorical_Info & info);

    /** Polymorphically rebuild a Categorical_Info from a store. */
    static boost::shared_ptr<Categorical_Info>
    poly_reconstitute(DB::Store_Reader & store);

    // Target-language string hooks: __str__ and __repr__ both return the
    // output of print().
    %extend {
        std::string __str__() const { return $self->print(); }
        std::string __repr__() const { return $self->print(); }
    }
};
} // namespace ML
// Register ML::Categorical_Info with the boost::shared_ptr support loaded
// from boost_shared_ptr.i.
// NOTE(review): unlike the SWIG_SHARED_PTR_DERIVED lines later in this file,
// the proxy name here is namespace-qualified and the line ends in ';' —
// confirm that the difference is intended.
// NOTE(review): this registration follows the Categorical_Info definition;
// SWIG's shared_ptr documentation asks for the registration to precede any
// declaration of the type — TODO confirm for the SWIG version in use.
SWIG_SHARED_PTR(ML::Categorical_Info_Ptr, ML::Categorical_Info);
namespace ML {
/*****************************************************************************/
/* FEATURE_TYPE */
/*****************************************************************************/
/** Classifies a feature.  The type in turn determines how the learning
    algorithms attempt to learn rules for it.
*/
enum Feature_Type {
    UNKNOWN,      ///< the feature type has not been determined yet
    PRESENCE,     ///< present or not present; the value is unimportant
    BOOLEAN,      ///< true (1.0) or false (0.0)
    CATEGORICAL,  ///< categorical; ordering makes no sense
    REAL,         ///< real valued
    UNUSED1,      ///< formerly PROB
    INUTILE,      ///< inutile; should be ignored
    STRING        ///< open categorical feature
};
/*****************************************************************************/
/* FEATURE_INFO */
/*****************************************************************************/
/** Describes a single feature: the minimum amount of information that the
    algorithms need in order to do their job.
*/
struct Feature_Info {
public:
    /** Construct for one of the non-categorical types. */
    Feature_Info(Feature_Type type = REAL,
                 bool optional = false,
                 bool biased = false,
                 bool grouping = false);

    /** Construct for a categorical feature. */
    Feature_Info(boost::shared_ptr<const Categorical_Info> categorical,
                 bool optional = false,
                 bool biased = false,
                 Feature_Type type = CATEGORICAL,
                 bool grouping = false);

    void serialize(DB::Store_Writer & store) const;
    void reconstitute(DB::Store_Reader & store);

    /** Equality testing. @{ */
    bool operator == (const Feature_Info & other) const;
    bool operator != (const Feature_Info & other) const;
    //@}

    /** Print in ASCII format.  The output can be parsed again later. */
    std::string print() const;

    /** Number of distinct values this feature can take.  Real features,
        which take an infinite number of values, report 0. */
    size_t value_count() const;

    /** The type of this feature. */
    Feature_Type type() const { return (Feature_Type)type_; }

    /** The categorical info attached to this feature. */
    boost::shared_ptr<const Categorical_Info> categorical() const { return categorical_; }

    /** When true, nothing should be inferred from the absence of this
        feature from a dataset. */
    bool optional() const { return optional_; }

    /** When true, the feature is biased: it contains some outside
        information about the thing being measured (for example the label
        variable) and should not be learned from. */
    bool biased() const { return biased_; }

    /** When true, the feature is used to group parts of datasets together,
        and it can be adjusted so that it is strictly increasing over the
        dataset. */
    bool grouping() const { return grouping_; }

    // Target-language string hooks: __str__ and __repr__ both return the
    // output of print().
    %extend {
        std::string __str__() const { return $self->print(); }
        std::string __repr__() const { return $self->print(); }
    }
};
/** String form of a Feature_Type value. */
std::string print(Feature_Type type);

// NOTE(review): presumably the info reported for a feature that has none
// recorded — confirm against feature_info.h.
extern const Feature_Info MISSING_FEATURE_INFO;

/** Guess the info of the feature feat, based upon the training data.  The
    default for before is the enumerator UNKNOWN, which becomes a
    Feature_Info through the Feature_Info(Feature_Type, ...) constructor.
*/
Feature_Info guess_info(const Training_Data & data,
                        const Feature & feat,
                        const Feature_Info & before = UNKNOWN);

/** Return the most inclusive of two feature info values.  Used when the two
    were detected automatically over different datasets, to obtain the real
    (combined) feature info.
*/
Feature_Info promote(const Feature_Info & i1, const Feature_Info & i2);
/*****************************************************************************/
/* FIXED_CATEGORICAL_INFO */
/*****************************************************************************/
/** Categorical_Info subclass that overrides every pure virtual member of
    the base interface.
*/
struct Fixed_Categorical_Info : public Categorical_Info {
public:
    /** Default construct, for when the object is reconstituted afterwards. */
    Fixed_Categorical_Info();

    /** Construct a bogus list of the given length. */
    Fixed_Categorical_Info(unsigned num);

    /** Construct from a list of names. */
    Fixed_Categorical_Info(const std::vector<std::string> & names);

    /** Reconstitute from a store. */
    Fixed_Categorical_Info(DB::Store_Reader & store);

    virtual ~Fixed_Categorical_Info();

    // Overrides of the Categorical_Info interface.
    virtual std::string print() const;
    virtual std::string print(int value) const;
    virtual int lookup(const std::string & value) const;
    virtual unsigned count() const;
    virtual void serialize(DB::Store_Writer & store) const;
    virtual void reconstitute(DB::Store_Reader & store);
    virtual std::string class_id() const;
    virtual void freeze();

    // Target-language string hooks: __str__ and __repr__ both return the
    // output of print().
    %extend {
        std::string __str__() const { return $self->print(); }
        std::string __repr__() const { return $self->print(); }
    }
};
/*****************************************************************************/
/* MUTABLE_CATEGORICAL_INFO */
/*****************************************************************************/
/** Fixed_Categorical_Info subclass that can take on new names through
    parse_or_add(); it overrides lookup() and freeze() and exposes a public
    frozen flag.
*/
struct Mutable_Categorical_Info : public Fixed_Categorical_Info {
public:
    /** Construct an empty list. */
    Mutable_Categorical_Info();

    /** Construct from a list of names. */
    Mutable_Categorical_Info(const std::vector<std::string> & names);

    /** Construct a bogus list of names. */
    Mutable_Categorical_Info(unsigned num);

    /** Reconstitute from a store. */
    Mutable_Categorical_Info(DB::Store_Reader & store);

    /** Copy another Categorical_Info object. */
    Mutable_Categorical_Info(const Categorical_Info & other);

    /** Parse the given name if it is already known; otherwise add it to
        the internal structures.
        NOTE(review): declared const although it can add a name —
        presumably mirrors the declaration in feature_info.h; confirm. */
    int parse_or_add(const std::string & name) const;

    virtual int lookup(const std::string & value) const;
    virtual void freeze();

    // NOTE(review): presumably set once freeze() has been called — confirm
    // against the implementation.
    bool frozen;

    // Target-language string hooks: __str__ and __repr__ both return the
    // output of print().
    %extend {
        std::string __str__() const { return $self->print(); }
        std::string __repr__() const { return $self->print(); }
    }
};
} // namespace ML
// Register the two Categorical_Info subclasses with the boost::shared_ptr
// support, each one naming ML::Categorical_Info as its base.
// NOTE(review): Mutable_Categorical_Info is declared in this file as deriving
// from Fixed_Categorical_Info, yet the base given here is
// ML::Categorical_Info — confirm which base the macro expects.
SWIG_SHARED_PTR_DERIVED(Fixed_Categorical_Info_Ptr, ML::Categorical_Info, ML::Fixed_Categorical_Info)
SWIG_SHARED_PTR_DERIVED(Mutable_Categorical_Info_Ptr, ML::Categorical_Info, ML::Mutable_Categorical_Info)
namespace ML {
/*****************************************************************************/
/* MUTABLE_FEATURE_INFO */
/*****************************************************************************/
/** A Feature_Info whose fields can be modified after construction. */
struct Mutable_Feature_Info : public Feature_Info {
    /** Initialize from a Feature_Info object. */
    Mutable_Feature_Info(const Feature_Info & info);

    /** Initialise for one of the non-categorical types. */
    Mutable_Feature_Info(Feature_Type type = REAL, bool optional = false);

    /** Initialise for a categorical feature info. */
    Mutable_Feature_Info(boost::shared_ptr<Mutable_Categorical_Info> categorical,
                         bool optional = false,
                         Feature_Type type = CATEGORICAL /* or STRING */);

    void reconstitute(DB::Store_Reader & store);

    /** Turn a non-categorical feature info into a categorical one. */
    void make_categorical(Feature_Type type = CATEGORICAL);

    /** Set the categorical info from a shared pointer. */
    void set_categorical(boost::shared_ptr<Mutable_Categorical_Info> info,
                         Feature_Type type = CATEGORICAL);

    /** Set the categorical info from a raw pointer. */
    void set_categorical(Mutable_Categorical_Info * info,
                         Feature_Type type = CATEGORICAL);

    /** Return the mutable categorical info.  Throws if it is not the same
        pointer as the categorical info held by the base class. */
    boost::shared_ptr<Mutable_Categorical_Info> mutable_categorical() const
    {
        if (categorical_ != mutable_categorical_)
            throw Exception("Mutable_Feature_Info::categorical(): out of sync");
        return mutable_categorical_;
    }

    /** Set the feature type. */
    void set_type(Feature_Type type);

    /** Set the optional flag. */
    void set_optional(bool optional);

    /** Set the biased flag. */
    void set_biased(bool biased);

    /** Set the grouping flag. */
    void set_grouping(bool grouping);

    /** Parse from a text file. */
    void parse(Parse_Context & context);

    /** Stop it from growing. */
    void freeze();

    // Target-language string hooks: __str__ and __repr__ both return the
    // output of print().
    %extend {
        std::string __str__() const { return $self->print(); }
        std::string __repr__() const { return $self->print(); }
    }
};
/** Guess the feature info of every feature and update the given feature
    space to reflect the result.  finish() must already have been called.
*/
void guess_all_info(const Training_Data & data,
                    Mutable_Feature_Space & fs,
                    bool use_existing);
} // namespace ML