forked from Rdatatable/data.table
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubset.c
347 lines (323 loc) · 16.3 KB
/
subset.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#include "data.table.h"
static void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA)
// Only for use by subsetDT() or subsetVector() below, hence static
{
const int n = length(idx);
if (length(ans)!=n) error("Internal error: subsetVectorRaw length(ans)==%d n=%d", length(ans), n);
const int *restrict idxp = INTEGER(idx);
// anyNA refers to NA _in idx_; if there's NA in the data (source) that's just regular data to be copied
// negatives, zeros and out-of-bounds have already been dealt with in convertNegAndZero so we can rely
// here on idx in range [1,length(ans)].
#define PARLOOP(_NAVAL_) \
if (anyNA) { \
_Pragma("omp parallel for num_threads(getDTthreads())") \
for (int i=0; i<n; i++) { \
int elem = idxp[i]; \
ap[i] = elem==NA_INTEGER ? _NAVAL_ : sp[elem-1]; \
} \
} else { \
_Pragma("omp parallel for num_threads(getDTthreads())") \
for (int i=0; i<n; i++) { \
ap[i] = sp[idxp[i]-1]; \
} \
}
// For small n such as 2,3,4 etc we hope OpenMP will be sensible inside it and not create a team with each thread doing just one item. Otherwise,
// call overhead would be too high for highly iterated calls on very small subests. Timings were tested in #3175
// Futher, we desire (currently at least) to stress-test the threaded code (especially in latest R-devel) on small data to reduce chance that bugs
// arise only over a threshold of n.
switch(TYPEOF(source)) {
case INTSXP: case LGLSXP: {
int *sp = INTEGER(source);
int *ap = INTEGER(ans);
PARLOOP(NA_INTEGER)
} break;
case REALSXP : {
if (INHERITS(source, char_integer64)) {
int64_t *sp = (int64_t *)REAL(source);
int64_t *ap = (int64_t *)REAL(ans);
PARLOOP(INT64_MIN)
} else {
double *sp = REAL(source);
double *ap = REAL(ans);
PARLOOP(NA_REAL)
}
} break;
case STRSXP : {
// write barrier (assigning strings/lists) is not thread safe. Hence single threaded.
// To go parallel here would need access to NODE_IS_OLDER, at least. Given gcgen, mark and named
// are upper bounded and max 3, REFCNT==REFCNTMAX could be checked first and then critical SET_ if not.
// Inside that critical just before SET_ it could check REFCNT<REFCNTMAX still held. Similarly for gcgen.
// TODO - discuss with Luke Tierney. Produce benchmarks on integer/double to see if it's worth making a safe
// API interface for package use for STRSXP.
// Aside: setkey() is a separate special case (a permutation) and does do this in parallel without using SET_*.
SEXP *sp = STRING_PTR(source);
if (anyNA) {
for (int i=0; i<n; i++) { int elem = idxp[i]; SET_STRING_ELT(ans, i, elem==NA_INTEGER ? NA_STRING : sp[elem-1]); }
} else {
for (int i=0; i<n; i++) { SET_STRING_ELT(ans, i, sp[idxp[i]-1]); }
}
} break;
case VECSXP : {
// VECTOR_PTR does exist but returns 'not safe to return vector pointer' when USE_RINTERNALS is not defined.
// VECTOR_DATA and LIST_POINTER exist too but call VECTOR_PTR. All are clearly not intended to be used by packages.
// The concern is overhead inside VECTOR_ELT() biting when called repetitively in a loop like we do here. That's why
// we take the R API (INTEGER()[i], REAL()[i], etc) outside loops for the simple types even when not parallel. For this
// type list case (VECSXP) it might be that some items are ALTREP for example, so we really should use the heavier
// _ELT accessor (VECTOR_ELT) inside the loop in this case.
if (anyNA) {
for (int i=0; i<n; i++) { int elem = idxp[i]; SET_VECTOR_ELT(ans, i, elem==NA_INTEGER ? R_NilValue : VECTOR_ELT(source, elem-1)); }
} else {
for (int i=0; i<n; i++) { SET_VECTOR_ELT(ans, i, VECTOR_ELT(source, idxp[i]-1)); }
}
} break;
case CPLXSXP : {
Rcomplex *sp = COMPLEX(source);
Rcomplex *ap = COMPLEX(ans);
Rcomplex NA_CPLX = { NA_REAL, NA_REAL };
PARLOOP(NA_CPLX)
} break;
case RAWSXP : {
Rbyte *sp = RAW(source);
Rbyte *ap = RAW(ans);
PARLOOP(0)
} break;
default :
error("Internal error: column type '%s' not supported by data.table subset. All known types are supported so please report as bug.", type2char(TYPEOF(source))); // # nocov
}
}
static const char *check_idx(SEXP idx, int max, bool *anyNA_out, bool *orderedSubset_out)
// set anyNA for branchless subsetVectorRaw
// error if any negatives, zeros or >max since they should have been dealt with by convertNegAndZeroIdx() called ealier at R level.
// single cache efficient sweep with prefetch, so very low priority to go parallel
{
if (!isInteger(idx)) error("Internal error. 'idx' is type '%s' not 'integer'", type2char(TYPEOF(idx))); // # nocov
bool anyLess=false, anyNA=false;
int last = INT32_MIN;
int *idxp = INTEGER(idx), n=LENGTH(idx);
for (int i=0; i<n; i++) {
int elem = idxp[i];
if (elem<=0 && elem!=NA_INTEGER) return "Internal inefficiency: idx contains negatives or zeros. Should have been dealt with earlier."; // e.g. test 762 (TODO-fix)
if (elem>max) return "Internal inefficiency: idx contains an item out-of-range. Should have been dealt with earlier."; // e.g. test 1639.64
anyNA |= elem==NA_INTEGER;
anyLess |= elem<last;
last = elem;
}
*anyNA_out = anyNA;
*orderedSubset_out = !anyLess; // for the purpose of ordered keys elem==last is allowed
return NULL;
}
SEXP convertNegAndZeroIdx(SEXP idx, SEXP maxArg, SEXP allowOverMax)
{
// called from [.data.table to massage user input, creating a new strictly positive idx if there are any negatives or zeros
// + more precise and helpful error messages telling user exactly where the problem is (saving user debugging time)
// + a little more efficient than negativeSubscript in src/main/subscript.c (it's private to R so we can't call it anyway)
// allowOverMaxArg is false when := (test 1024), otherwise true for selecting
if (!isInteger(idx)) error("Internal error. 'idx' is type '%s' not 'integer'", type2char(TYPEOF(idx))); // # nocov
if (!isInteger(maxArg) || length(maxArg)!=1) error("Internal error. 'maxArg' is type '%s' and length %d, should be an integer singleton", type2char(TYPEOF(maxArg)), length(maxArg)); // # nocov
if (!isLogical(allowOverMax) || LENGTH(allowOverMax)!=1 || LOGICAL(allowOverMax)[0]==NA_LOGICAL) error("Internal error: allowOverMax must be TRUE/FALSE"); // # nocov
int max = INTEGER(maxArg)[0], n=LENGTH(idx);
if (max<0) error("Internal error. max is %d, must be >= 0.", max); // # nocov includes NA which will print as INT_MIN
int *idxp = INTEGER(idx);
bool stop = false;
#pragma omp parallel for num_threads(getDTthreads())
for (int i=0; i<n; i++) {
if (stop) continue;
int elem = idxp[i];
if ((elem<1 && elem!=NA_INTEGER) || elem>max) stop=true;
}
if (!stop) return(idx); // most common case to return early: no 0, no negative; all idx either NA or in range [1-max]
// ---------
// else massage the input to a standard idx where all items are either NA or in range [1,max] ...
int countNeg=0, countZero=0, countNA=0, firstOverMax=0;
for (int i=0; i<n; i++) {
int elem = idxp[i];
if (elem==NA_INTEGER) countNA++;
else if (elem<0) countNeg++;
else if (elem==0) countZero++;
else if (elem>max && firstOverMax==0) firstOverMax=i+1;
}
if (firstOverMax && LOGICAL(allowOverMax)[0]==FALSE) {
error("i[%d] is %d which is out of range [1,nrow=%d]", firstOverMax, idxp[firstOverMax-1], max);
}
int countPos = n-countNeg-countZero-countNA;
if (countPos && countNeg) {
int i=0, firstNeg=0, firstPos=0;
while (i<n && (firstNeg==0 || firstPos==0)) {
int elem = idxp[i];
if (firstPos==0 && elem>0) firstPos=i+1;
if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) firstNeg=i+1;
i++;
}
error("Item %d of i is %d and item %d is %d. Cannot mix positives and negatives.", firstNeg, idxp[firstNeg-1], firstPos, idxp[firstPos-1]);
}
if (countNeg && countNA) {
int i=0, firstNeg=0, firstNA=0;
while (i<n && (firstNeg==0 || firstNA==0)) {
int elem = idxp[i];
if (firstNeg==0 && elem<0 && elem!=NA_INTEGER) firstNeg=i+1;
if (firstNA==0 && elem==NA_INTEGER) firstNA=i+1;
i++;
}
error("Item %d of i is %d and item %d is NA. Cannot mix negatives and NA.", firstNeg, idxp[firstNeg-1], firstNA);
}
SEXP ans;
if (countNeg==0) {
// just zeros to remove, or >max to convert to NA
ans = PROTECT(allocVector(INTSXP, n - countZero));
int *ansp = INTEGER(ans);
for (int i=0, ansi=0; i<n; i++) {
int elem = idxp[i];
if (elem==0) continue;
ansp[ansi++] = elem>max ? NA_INTEGER : elem;
}
} else {
// idx is all negative without any NA but perhaps some zeros
bool *keep = (bool *)R_alloc(max, sizeof(bool)); // 4 times less memory that INTSXP in src/main/subscript.c
for (int i=0; i<max; i++) keep[i] = true;
int countRemoved=0, countDup=0, countBeyond=0; // idx=c(-10,-5,-10) removing row 10 twice
int firstBeyond=0, firstDup=0;
for (int i=0; i<n; i++) {
int elem = -idxp[i];
if (elem==0) continue;
if (elem>max) {
countBeyond++;
if (firstBeyond==0) firstBeyond=i+1;
continue;
}
if (!keep[elem-1]) {
countDup++;
if (firstDup==0) firstDup=i+1;
} else {
keep[elem-1] = false;
countRemoved++;
}
}
if (countBeyond)
warning("Item %d of i is %d but there are only %d rows. Ignoring this and %d more like it out of %d.", firstBeyond, idxp[firstBeyond-1], max, countBeyond-1, n);
if (countDup)
warning("Item %d of i is %d which removes that item but that has occurred before. Ignoring this dup and %d other dups.", firstDup, idxp[firstDup-1], countDup-1);
int ansn = max-countRemoved;
ans = PROTECT(allocVector(INTSXP, ansn));
int *ansp = INTEGER(ans);
for (int i=0, ansi=0; i<max; i++) {
if (keep[i]) ansp[ansi++] = i+1;
}
}
UNPROTECT(1);
return ans;
}
static void checkCol(SEXP col, int colNum, int nrow, SEXP x)
{
if (isNull(col)) error("Column %d is NULL; malformed data.table.", colNum);
if (isNewList(col) && INHERITS(col, char_dataframe)) {
SEXP names = getAttrib(x, R_NamesSymbol);
error("Column %d ['%s'] is a data.frame or data.table; malformed data.table.",
colNum, isNull(names)?"":CHAR(STRING_ELT(names,colNum-1)));
}
if (length(col)!=nrow) {
SEXP names = getAttrib(x, R_NamesSymbol);
error("Column %d ['%s'] is length %d but column 1 is length %d; malformed data.table.",
colNum, isNull(names)?"":CHAR(STRING_ELT(names,colNum-1)), length(col), nrow);
}
}
/*
* subsetDT - Subsets a data.table
* NOTE:
* 1) 'rows' and 'cols' are 1-based, passed from R level
* 2) Originally for subsetting vectors in fcast and now the beginnings of [.data.table ported to C
* 3) Immediate need is for R 3.1 as lglVec[1] now returns R's global TRUE and we don't want := to change that global [think 1 row data.tables]
* 4) Could do it other ways but may as well go to C now as we were going to do that anyway
*/
SEXP subsetDT(SEXP x, SEXP rows, SEXP cols) {
int nprotect=0;
if (!isNewList(x)) error("Internal error. Argument 'x' to CsubsetDT is type '%s' not 'list'", type2char(TYPEOF(rows))); // # nocov
if (!length(x)) return(x); // return empty list
const int nrow = length(VECTOR_ELT(x,0));
// check index once up front for 0 or NA, for branchless subsetVectorRaw which is repeated for each column
bool anyNA=false, orderedSubset=true; // true for when rows==null (meaning all rows)
if (!isNull(rows) && check_idx(rows, nrow, &anyNA, &orderedSubset)!=NULL) {
SEXP max = PROTECT(ScalarInteger(nrow)); nprotect++;
rows = PROTECT(convertNegAndZeroIdx(rows, max, ScalarLogical(TRUE))); nprotect++;
const char *err = check_idx(rows, nrow, &anyNA, &orderedSubset);
if (err!=NULL) error(err);
}
if (!isInteger(cols)) error("Internal error. Argument 'cols' to Csubset is type '%s' not 'integer'", type2char(TYPEOF(cols))); // # nocov
for (int i=0; i<LENGTH(cols); i++) {
int this = INTEGER(cols)[i];
if (this<1 || this>LENGTH(x)) error("Item %d of 'cols' is %d which is outside 1-based range [1,ncol(x)=%d]", i+1, this, LENGTH(x));
}
int overAlloc = checkOverAlloc(GetOption(install("datatable.alloccol"), R_NilValue));
SEXP ans = PROTECT(allocVector(VECSXP, LENGTH(cols)+overAlloc)); nprotect++; // doing alloc.col directly here; eventually alloc.col can be deprecated.
// user-defined and superclass attributes get copied as from v1.12.0
copyMostAttrib(x, ans);
// most means all except R_NamesSymbol, R_DimSymbol and R_DimNamesSymbol
// includes row.names (oddly, given other dims aren't) and "sorted" dealt with below
// class is also copied here which retains superclass name in class vector as has been the case for many years; e.g. tests 1228.* for #5296
SET_TRUELENGTH(ans, LENGTH(ans));
SETLENGTH(ans, LENGTH(cols));
int ansn;
if (isNull(rows)) {
ansn = nrow;
const int *colD = INTEGER(cols);
for (int i=0; i<LENGTH(cols); i++) {
SEXP thisCol = VECTOR_ELT(x, colD[i]-1);
checkCol(thisCol, colD[i], nrow, x);
SET_VECTOR_ELT(ans, i, duplicate(thisCol));
// materialize the column subset as we have always done for now, until REFCNT is on by default in R (TODO)
}
} else {
ansn = LENGTH(rows); // has been checked not to contain zeros or negatives, so this length is the length of result
const int *colD = INTEGER(cols);
for (int i=0; i<LENGTH(cols); i++) {
SEXP source = VECTOR_ELT(x, colD[i]-1);
checkCol(source, colD[i], nrow, x);
SEXP target;
SET_VECTOR_ELT(ans, i, target=allocVector(TYPEOF(source), ansn));
copyMostAttrib(source, target);
subsetVectorRaw(target, source, rows, anyNA); // parallel within column
}
}
SEXP tmp = PROTECT(allocVector(STRSXP, LENGTH(cols)+overAlloc)); nprotect++;
SET_TRUELENGTH(tmp, LENGTH(tmp));
SETLENGTH(tmp, LENGTH(cols));
setAttrib(ans, R_NamesSymbol, tmp);
subsetVectorRaw(tmp, getAttrib(x, R_NamesSymbol), cols, /*anyNA=*/false);
tmp = PROTECT(allocVector(INTSXP, 2)); nprotect++;
INTEGER(tmp)[0] = NA_INTEGER;
INTEGER(tmp)[1] = -ansn;
setAttrib(ans, R_RowNamesSymbol, tmp); // The contents of tmp must be set before being passed to setAttrib(). setAttrib looks at tmp value and copies it in the case of R_RowNamesSymbol. Caused hard to track bug around 28 Sep 2014.
// clear any index that was copied over by copyMostAttrib() above, e.g. #1760 and #1734 (test 1678)
setAttrib(ans, sym_index, R_NilValue);
// but maintain key if ordered subset
SEXP key = getAttrib(x, sym_sorted);
if (length(key)) {
SEXP in = PROTECT(chin(key, getAttrib(ans,R_NamesSymbol))); nprotect++;
int i = 0; while(i<LENGTH(key) && LOGICAL(in)[i]) i++;
// i is now the keylen that can be kept. 2 lines above much easier in C than R
if (i==0 || !orderedSubset) {
// clear key that was copied over by copyMostAttrib() above
setAttrib(ans, sym_sorted, R_NilValue);
} else {
// make a new key attribute; shorter if i<LENGTH(key) or same length copied so this key is safe to change by ref (setnames)
setAttrib(ans, sym_sorted, tmp=allocVector(STRSXP, i));
for (int j=0; j<i; j++) SET_STRING_ELT(tmp, j, STRING_ELT(key, j));
}
}
setAttrib(ans, install(".data.table.locked"), R_NilValue);
setselfref(ans);
UNPROTECT(nprotect);
return ans;
}
SEXP subsetVector(SEXP x, SEXP idx) { // idx is 1-based passed from R level
bool anyNA=false, orderedSubset=false;
int nprotect=0;
if (isNull(x))
error("Internal error: NULL can not be subset. It is invalid for a data.table to contain a NULL column."); // # nocov
if (check_idx(idx, length(x), &anyNA, &orderedSubset) != NULL)
error("Internal error: CsubsetVector is internal-use-only but has received negatives, zeros or out-of-range"); // # nocov
SEXP ans = PROTECT(allocVector(TYPEOF(x), length(idx))); nprotect++;
copyMostAttrib(x, ans);
subsetVectorRaw(ans, x, idx, anyNA);
UNPROTECT(nprotect);
return ans;
}