forked from talent-plan/tinysql
-
Notifications
You must be signed in to change notification settings - Fork 0
/
selectivity.go
314 lines (297 loc) · 10 KB
/
selectivity.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
package statistics
import (
"math"
"github.com/pingcap/errors"
"github.com/pingcap/tidb/expression"
"github.com/pingcap/tidb/parser/ast"
"github.com/pingcap/tidb/parser/mysql"
planutil "github.com/pingcap/tidb/planner/util"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/ranger"
)
// If one condition can't be calculated, we will assume that the selectivity of this condition is 0.8.
const selectionFactor = 0.8
// StatsNode is used for calculating selectivity.
type StatsNode struct {
Tp int
ID int64
// mask is a bit pattern whose ith bit will indicate whether the ith expression is covered by this index/column.
mask int64
// Selectivity indicates the Selectivity of this column/index.
Selectivity float64
// numCols is the number of columns contained in the index or column(which is always 1).
numCols int
// partCover indicates whether the bit in the mask is for a full cover or partial cover. It is only true
// when the condition is a DNF expression on index, and the expression is not totally extracted as access condition.
partCover bool
}
// The type of the StatsNode.
const (
IndexType = iota
PkType
ColType
)
const unknownColumnID = math.MinInt64
// getConstantColumnID receives two expressions and if one of them is column and another is constant, it returns the
// ID of the column.
func getConstantColumnID(e []expression.Expression) int64 {
if len(e) != 2 {
return unknownColumnID
}
col, ok1 := e[0].(*expression.Column)
_, ok2 := e[1].(*expression.Constant)
if ok1 && ok2 {
return col.ID
}
col, ok1 = e[1].(*expression.Column)
_, ok2 = e[0].(*expression.Constant)
if ok1 && ok2 {
return col.ID
}
return unknownColumnID
}
func pseudoSelectivity(coll *HistColl, exprs []expression.Expression) float64 {
minFactor := selectionFactor
colExists := make(map[string]bool)
for _, expr := range exprs {
fun, ok := expr.(*expression.ScalarFunction)
if !ok {
continue
}
colID := getConstantColumnID(fun.GetArgs())
if colID == unknownColumnID {
continue
}
switch fun.FuncName.L {
case ast.EQ, ast.In:
minFactor = math.Min(minFactor, 1.0/pseudoEqualRate)
col, ok := coll.Columns[colID]
if !ok {
continue
}
colExists[col.Info.Name.L] = true
if mysql.HasUniKeyFlag(col.Info.Flag) {
return 1.0 / float64(coll.Count)
}
case ast.GE, ast.GT, ast.LE, ast.LT:
minFactor = math.Min(minFactor, 1.0/pseudoLessRate)
// FIXME: To resolve the between case.
}
}
if len(colExists) == 0 {
return minFactor
}
// use the unique key info
for _, idx := range coll.Indices {
if !idx.Info.Unique {
continue
}
unique := true
for _, col := range idx.Info.Columns {
if !colExists[col.Name.L] {
unique = false
break
}
}
if unique {
return 1.0 / float64(coll.Count)
}
}
return minFactor
}
// Selectivity is a function calculate the selectivity of the expressions.
// The definition of selectivity is (row count after filter / row count before filter).
// And exprs must be CNF now, in other words, `exprs[0] and exprs[1] and ... and exprs[len - 1]` should be held when you call this.
// Currently the time complexity is o(n^2).
func (coll *HistColl) Selectivity(ctx sessionctx.Context, exprs []expression.Expression, filledPaths []*planutil.AccessPath) (float64, error) {
// If table's count is zero or conditions are empty, we should return 100% selectivity.
if coll.Count == 0 || len(exprs) == 0 {
return 1, nil
}
// TODO: If len(exprs) is bigger than 63, we could use bitset structure to replace the int64.
// This will simplify some code and speed up if we use this rather than a boolean slice.
if len(exprs) > 63 || (len(coll.Columns) == 0 && len(coll.Indices) == 0) {
return pseudoSelectivity(coll, exprs), nil
}
ret := 1.0
var nodes []*StatsNode
sc := ctx.GetSessionVars().StmtCtx
remainedExprs := make([]expression.Expression, 0, len(exprs))
remainedExprs = append(remainedExprs, exprs...)
extractedCols := make([]*expression.Column, 0, len(coll.Columns))
extractedCols = expression.ExtractColumnsFromExpressions(extractedCols, remainedExprs, nil)
for id, colInfo := range coll.Columns {
col := expression.ColInfo2Col(extractedCols, colInfo.Info)
if col != nil {
maskCovered, ranges, _, err := getMaskAndRanges(ctx, remainedExprs, ranger.ColumnRangeType, nil, nil, col)
if err != nil {
return 0, errors.Trace(err)
}
nodes = append(nodes, &StatsNode{Tp: ColType, ID: id, mask: maskCovered, numCols: 1})
if colInfo.IsHandle {
nodes[len(nodes)-1].Tp = PkType
var cnt float64
cnt, err = coll.GetRowCountByIntColumnRanges(sc, id, ranges)
if err != nil {
return 0, errors.Trace(err)
}
nodes[len(nodes)-1].Selectivity = cnt / float64(coll.Count)
continue
}
cnt, err := coll.GetRowCountByColumnRanges(sc, id, ranges)
if err != nil {
return 0, errors.Trace(err)
}
nodes[len(nodes)-1].Selectivity = cnt / float64(coll.Count)
}
}
id2Paths := make(map[int64]*planutil.AccessPath)
for _, path := range filledPaths {
if path.IsTablePath {
continue
}
id2Paths[path.Index.ID] = path
}
for id, idxInfo := range coll.Indices {
idxCols := expression.FindPrefixOfIndex(extractedCols, coll.Idx2ColumnIDs[id])
if len(idxCols) > 0 {
lengths := make([]int, 0, len(idxCols))
for i := 0; i < len(idxCols); i++ {
lengths = append(lengths, idxInfo.Info.Columns[i].Length)
}
maskCovered, ranges, partCover, err := getMaskAndRanges(ctx, remainedExprs, ranger.IndexRangeType, lengths, id2Paths[idxInfo.ID], idxCols...)
if err != nil {
return 0, errors.Trace(err)
}
cnt, err := coll.GetRowCountByIndexRanges(sc, id, ranges)
if err != nil {
return 0, errors.Trace(err)
}
selectivity := cnt / float64(coll.Count)
nodes = append(nodes, &StatsNode{
Tp: IndexType,
ID: id,
mask: maskCovered,
numCols: len(idxInfo.Info.Columns),
Selectivity: selectivity,
partCover: partCover,
})
}
}
usedSets := getUsableSetsByGreedy(nodes)
// Initialize the mask with the full set.
mask := (int64(1) << uint(len(remainedExprs))) - 1
for _, set := range usedSets {
mask &^= set.mask
ret *= set.Selectivity
// If `partCover` is true, it means that the conditions are in DNF form, and only part
// of the DNF expressions are extracted as access conditions, so besides from the selectivity
// of the extracted access conditions, we multiply another selectionFactor for the residual
// conditions.
if set.partCover {
ret *= selectionFactor
}
}
// If there's still conditions which cannot be calculated, we will multiply a selectionFactor.
if mask > 0 {
ret *= selectionFactor
}
return ret, nil
}
func getMaskAndRanges(ctx sessionctx.Context, exprs []expression.Expression, rangeType ranger.RangeType, lengths []int, cachedPath *planutil.AccessPath, cols ...*expression.Column) (mask int64, ranges []*ranger.Range, partCover bool, err error) {
sc := ctx.GetSessionVars().StmtCtx
isDNF := false
var accessConds, remainedConds []expression.Expression
switch rangeType {
case ranger.ColumnRangeType:
accessConds = ranger.ExtractAccessConditionsForColumn(exprs, cols[0].UniqueID)
ranges, err = ranger.BuildColumnRange(accessConds, sc, cols[0].RetType, types.UnspecifiedLength)
case ranger.IndexRangeType:
if cachedPath != nil {
ranges, accessConds, remainedConds, isDNF = cachedPath.Ranges, cachedPath.AccessConds, cachedPath.TableFilters, cachedPath.IsDNFCond
break
}
var res *ranger.DetachRangeResult
res, err = ranger.DetachCondAndBuildRangeForIndex(ctx, exprs, cols, lengths)
ranges, accessConds, remainedConds, isDNF = res.Ranges, res.AccessConds, res.RemainedConds, res.IsDNFCond
if err != nil {
return 0, nil, false, err
}
default:
panic("should never be here")
}
if err != nil {
return 0, nil, false, err
}
if isDNF && len(accessConds) > 0 {
mask |= 1
return mask, ranges, len(remainedConds) > 0, nil
}
for i := range exprs {
for j := range accessConds {
if exprs[i].Equal(ctx, accessConds[j]) {
mask |= 1 << uint64(i)
break
}
}
}
return mask, ranges, false, nil
}
// getUsableSetsByGreedy will select the indices and pk used for calculate selectivity by greedy algorithm.
func getUsableSetsByGreedy(nodes []*StatsNode) (newBlocks []*StatsNode) {
marked := make([]bool, len(nodes))
mask := int64(math.MaxInt64)
for {
// Choose the index that covers most.
bestID, bestCount, bestTp, bestNumCols, bestMask := -1, 0, ColType, 0, int64(0)
for i, set := range nodes {
if marked[i] {
continue
}
curMask := set.mask & mask
bits := popCount(curMask)
// This set cannot cover any thing, just skip it.
if bits == 0 {
continue
}
// We greedy select the stats info based on:
// (1): The stats type, always prefer the primary key or index.
// (2): The number of expression that it covers, the more the better.
// (3): The number of columns that it contains, the less the better.
if (bestTp == ColType && set.Tp != ColType) || bestCount < bits || (bestCount == bits && bestNumCols > set.numCols) {
bestID, bestCount, bestTp, bestNumCols, bestMask = i, bits, set.Tp, set.numCols, curMask
}
}
if bestCount == 0 {
break
}
// Update the mask, remove the bit that nodes[bestID].mask has.
mask &^= bestMask
newBlocks = append(newBlocks, nodes[bestID])
marked[bestID] = true
}
return
}
// popCount is the digit sum of the binary representation of the number x.
func popCount(x int64) int {
ret := 0
// x -= x & -x, remove the lowest bit of the x.
// e.g. result will be 2 if x is 3.
for ; x > 0; x -= x & -x {
ret++
}
return ret
}