forked from huangzworks/redis-3.0-annotated
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathredis.c
4086 lines (3629 loc) · 157 KB
/
redis.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "redis.h"
#include "cluster.h"
#include "slowlog.h"
#include "bio.h"
#include <time.h>
#include <signal.h>
#include <sys/wait.h>
#include <errno.h>
#include <assert.h>
#include <ctype.h>
#include <stdarg.h>
#include <arpa/inet.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/uio.h>
#include <limits.h>
#include <float.h>
#include <math.h>
#include <sys/resource.h>
#include <sys/utsname.h>
#include <locale.h>
/* Our shared "common" objects */
struct sharedObjectsStruct shared;
/* Global vars that are actually used as constants. The following double
* values are used for double on-disk serialization, and are initialized
* at runtime to avoid strange compiler optimizations. */
double R_Zero, R_PosInf, R_NegInf, R_Nan;
/*================================= Globals ================================= */
/* Global vars */
struct redisServer server; /* server global state */
struct redisCommand *commandTable;
/* Our command table.
*
* 命令表
*
* Every entry is composed of the following fields:
*
* 表中的每个项都由以下域组成:
*
* name: a string representing the command name.
* 命令的名字
*
* function: pointer to the C function implementing the command.
* 一个指向命令的实现函数的指针
*
* arity: number of arguments, it is possible to use -N to say >= N
* 参数的数量。可以用 -N 表示 >= N
*
* sflags: command flags as string. See below for a table of flags.
* 字符串形式的 FLAG ,用来计算以下的真实 FLAG
*
* flags: flags as bitmask. Computed by Redis using the 'sflags' field.
* 位掩码形式的 FLAG ,根据 sflags 的字符串计算得出
*
* get_keys_proc: an optional function to get key arguments from a command.
* This is only used when the following three fields are not
* enough to specify what arguments are keys.
* 一个可选的函数,用于从命令中取出 key 参数,仅在以下三个参数都不足以表示 key 参数时使用
*
* first_key_index: first argument that is a key
* 第一个 key 参数的位置
*
* last_key_index: last argument that is a key
* 最后一个 key 参数的位置
*
* key_step: step to get all the keys from first to last argument. For instance
* in MSET the step is two since arguments are key,val,key,val,...
* 从 first 参数和 last 参数之间,所有 key 的步数(step)
* 比如说, MSET 命令的格式为 MSET key value [key value ...]
* 它的 step 就为 2
*
* microseconds: microseconds of total execution time for this command.
* 执行这个命令耗费的总微秒数
*
* calls: total number of calls of this command.
* 命令被执行的总次数
*
* The flags, microseconds and calls fields are computed by Redis and should
* always be set to zero.
*
* microseconds 和 call 由 Redis 计算,总是初始化为 0 。
*
* Command flags are expressed using strings where every character represents
* a flag. Later the populateCommandTable() function will take care of
* populating the real 'flags' field using this characters.
*
* 命令的 FLAG 首先由 SFLAG 域设置,之后 populateCommandTable() 函数从 sflags 属性中计算出真正的 FLAG 到 flags 属性中。
*
* This is the meaning of the flags:
*
* 以下是各个 FLAG 的意义:
*
* w: write command (may modify the key space).
* 写入命令,可能会修改 key space
*
* r: read command (will never modify the key space).
* 读命令,不修改 key space
* m: may increase memory usage once called. Don't allow if out of memory.
* 可能会占用大量内存的命令,调用时对内存占用进行检查
*
* a: admin command, like SAVE or SHUTDOWN.
* 管理用途的命令,比如 SAVE 和 SHUTDOWN
*
* p: Pub/Sub related command.
* 发布/订阅相关的命令
*
* f: force replication of this command, regardless of server.dirty.
* 无视 server.dirty ,强制复制这个命令。
*
* s: command not allowed in scripts.
* 不允许在脚本中使用的命令
*
* R: random command. Command is not deterministic, that is, the same command
* with the same arguments, with the same key space, may have different
* results. For instance SPOP and RANDOMKEY are two random commands.
* 随机命令。
* 命令是非确定性的:对于同样的命令,同样的参数,同样的键,结果可能不同。
* 比如 SPOP 和 RANDOMKEY 就是这样的例子。
*
* S: Sort command output array if called from script, so that the output
* is deterministic.
* 如果命令在 Lua 脚本中执行,那么对输出进行排序,从而得出确定性的输出。
*
* l: Allow command while loading the database.
* 允许在载入数据库时使用的命令。
*
* t: Allow command while a slave has stale data but is not allowed to
* server this data. Normally no command is accepted in this condition
* but just a few.
* 允许在附属节点带有过期数据时执行的命令。
* 这类命令很少有,只有几个。
*
* M: Do not automatically propagate the command on MONITOR.
* 不要在 MONITOR 模式下自动广播的命令。
*
* k: Perform an implicit ASKING for this command, so the command will be
* accepted in cluster mode if the slot is marked as 'importing'.
* 为这个命令执行一个显式的 ASKING ,
* 使得在集群模式下,一个被标示为 importing 的槽可以接收这命令。
*/
struct redisCommand redisCommandTable[] = {
{"get",getCommand,2,"r",0,NULL,1,1,1,0,0},
{"set",setCommand,-3,"wm",0,NULL,1,1,1,0,0},
{"setnx",setnxCommand,3,"wm",0,NULL,1,1,1,0,0},
{"setex",setexCommand,4,"wm",0,NULL,1,1,1,0,0},
{"psetex",psetexCommand,4,"wm",0,NULL,1,1,1,0,0},
{"append",appendCommand,3,"wm",0,NULL,1,1,1,0,0},
{"strlen",strlenCommand,2,"r",0,NULL,1,1,1,0,0},
{"del",delCommand,-2,"w",0,NULL,1,-1,1,0,0},
{"exists",existsCommand,2,"r",0,NULL,1,1,1,0,0},
{"setbit",setbitCommand,4,"wm",0,NULL,1,1,1,0,0},
{"getbit",getbitCommand,3,"r",0,NULL,1,1,1,0,0},
{"setrange",setrangeCommand,4,"wm",0,NULL,1,1,1,0,0},
{"getrange",getrangeCommand,4,"r",0,NULL,1,1,1,0,0},
{"substr",getrangeCommand,4,"r",0,NULL,1,1,1,0,0},
{"incr",incrCommand,2,"wm",0,NULL,1,1,1,0,0},
{"decr",decrCommand,2,"wm",0,NULL,1,1,1,0,0},
{"mget",mgetCommand,-2,"r",0,NULL,1,-1,1,0,0},
{"rpush",rpushCommand,-3,"wm",0,NULL,1,1,1,0,0},
{"lpush",lpushCommand,-3,"wm",0,NULL,1,1,1,0,0},
{"rpushx",rpushxCommand,3,"wm",0,NULL,1,1,1,0,0},
{"lpushx",lpushxCommand,3,"wm",0,NULL,1,1,1,0,0},
{"linsert",linsertCommand,5,"wm",0,NULL,1,1,1,0,0},
{"rpop",rpopCommand,2,"w",0,NULL,1,1,1,0,0},
{"lpop",lpopCommand,2,"w",0,NULL,1,1,1,0,0},
{"brpop",brpopCommand,-3,"ws",0,NULL,1,1,1,0,0},
{"brpoplpush",brpoplpushCommand,4,"wms",0,NULL,1,2,1,0,0},
{"blpop",blpopCommand,-3,"ws",0,NULL,1,-2,1,0,0},
{"llen",llenCommand,2,"r",0,NULL,1,1,1,0,0},
{"lindex",lindexCommand,3,"r",0,NULL,1,1,1,0,0},
{"lset",lsetCommand,4,"wm",0,NULL,1,1,1,0,0},
{"lrange",lrangeCommand,4,"r",0,NULL,1,1,1,0,0},
{"ltrim",ltrimCommand,4,"w",0,NULL,1,1,1,0,0},
{"lrem",lremCommand,4,"w",0,NULL,1,1,1,0,0},
{"rpoplpush",rpoplpushCommand,3,"wm",0,NULL,1,2,1,0,0},
{"sadd",saddCommand,-3,"wm",0,NULL,1,1,1,0,0},
{"srem",sremCommand,-3,"w",0,NULL,1,1,1,0,0},
{"smove",smoveCommand,4,"w",0,NULL,1,2,1,0,0},
{"sismember",sismemberCommand,3,"r",0,NULL,1,1,1,0,0},
{"scard",scardCommand,2,"r",0,NULL,1,1,1,0,0},
{"spop",spopCommand,2,"wRs",0,NULL,1,1,1,0,0},
{"srandmember",srandmemberCommand,-2,"rR",0,NULL,1,1,1,0,0},
{"sinter",sinterCommand,-2,"rS",0,NULL,1,-1,1,0,0},
{"sinterstore",sinterstoreCommand,-3,"wm",0,NULL,1,-1,1,0,0},
{"sunion",sunionCommand,-2,"rS",0,NULL,1,-1,1,0,0},
{"sunionstore",sunionstoreCommand,-3,"wm",0,NULL,1,-1,1,0,0},
{"sdiff",sdiffCommand,-2,"rS",0,NULL,1,-1,1,0,0},
{"sdiffstore",sdiffstoreCommand,-3,"wm",0,NULL,1,-1,1,0,0},
{"smembers",sinterCommand,2,"rS",0,NULL,1,1,1,0,0},
{"sscan",sscanCommand,-3,"rR",0,NULL,1,1,1,0,0},
{"zadd",zaddCommand,-4,"wm",0,NULL,1,1,1,0,0},
{"zincrby",zincrbyCommand,4,"wm",0,NULL,1,1,1,0,0},
{"zrem",zremCommand,-3,"w",0,NULL,1,1,1,0,0},
{"zremrangebyscore",zremrangebyscoreCommand,4,"w",0,NULL,1,1,1,0,0},
{"zremrangebyrank",zremrangebyrankCommand,4,"w",0,NULL,1,1,1,0,0},
{"zremrangebylex",zremrangebylexCommand,4,"w",0,NULL,1,1,1,0,0},
{"zunionstore",zunionstoreCommand,-4,"wm",0,zunionInterGetKeys,0,0,0,0,0},
{"zinterstore",zinterstoreCommand,-4,"wm",0,zunionInterGetKeys,0,0,0,0,0},
{"zrange",zrangeCommand,-4,"r",0,NULL,1,1,1,0,0},
{"zrangebyscore",zrangebyscoreCommand,-4,"r",0,NULL,1,1,1,0,0},
{"zrevrangebyscore",zrevrangebyscoreCommand,-4,"r",0,NULL,1,1,1,0,0},
{"zrangebylex",zrangebylexCommand,-4,"r",0,NULL,1,1,1,0,0},
{"zrevrangebylex",zrevrangebylexCommand,-4,"r",0,NULL,1,1,1,0,0},
{"zcount",zcountCommand,4,"r",0,NULL,1,1,1,0,0},
{"zlexcount",zlexcountCommand,4,"r",0,NULL,1,1,1,0,0},
{"zrevrange",zrevrangeCommand,-4,"r",0,NULL,1,1,1,0,0},
{"zcard",zcardCommand,2,"r",0,NULL,1,1,1,0,0},
{"zscore",zscoreCommand,3,"r",0,NULL,1,1,1,0,0},
{"zrank",zrankCommand,3,"r",0,NULL,1,1,1,0,0},
{"zrevrank",zrevrankCommand,3,"r",0,NULL,1,1,1,0,0},
{"zscan",zscanCommand,-3,"rR",0,NULL,1,1,1,0,0},
{"hset",hsetCommand,4,"wm",0,NULL,1,1,1,0,0},
{"hsetnx",hsetnxCommand,4,"wm",0,NULL,1,1,1,0,0},
{"hget",hgetCommand,3,"r",0,NULL,1,1,1,0,0},
{"hmset",hmsetCommand,-4,"wm",0,NULL,1,1,1,0,0},
{"hmget",hmgetCommand,-3,"r",0,NULL,1,1,1,0,0},
{"hincrby",hincrbyCommand,4,"wm",0,NULL,1,1,1,0,0},
{"hincrbyfloat",hincrbyfloatCommand,4,"wm",0,NULL,1,1,1,0,0},
{"hdel",hdelCommand,-3,"w",0,NULL,1,1,1,0,0},
{"hlen",hlenCommand,2,"r",0,NULL,1,1,1,0,0},
{"hkeys",hkeysCommand,2,"rS",0,NULL,1,1,1,0,0},
{"hvals",hvalsCommand,2,"rS",0,NULL,1,1,1,0,0},
{"hgetall",hgetallCommand,2,"r",0,NULL,1,1,1,0,0},
{"hexists",hexistsCommand,3,"r",0,NULL,1,1,1,0,0},
{"hscan",hscanCommand,-3,"rR",0,NULL,1,1,1,0,0},
{"incrby",incrbyCommand,3,"wm",0,NULL,1,1,1,0,0},
{"decrby",decrbyCommand,3,"wm",0,NULL,1,1,1,0,0},
{"incrbyfloat",incrbyfloatCommand,3,"wm",0,NULL,1,1,1,0,0},
{"getset",getsetCommand,3,"wm",0,NULL,1,1,1,0,0},
{"mset",msetCommand,-3,"wm",0,NULL,1,-1,2,0,0},
{"msetnx",msetnxCommand,-3,"wm",0,NULL,1,-1,2,0,0},
{"randomkey",randomkeyCommand,1,"rR",0,NULL,0,0,0,0,0},
{"select",selectCommand,2,"rl",0,NULL,0,0,0,0,0},
{"move",moveCommand,3,"w",0,NULL,1,1,1,0,0},
{"rename",renameCommand,3,"w",0,NULL,1,2,1,0,0},
{"renamenx",renamenxCommand,3,"w",0,NULL,1,2,1,0,0},
{"expire",expireCommand,3,"w",0,NULL,1,1,1,0,0},
{"expireat",expireatCommand,3,"w",0,NULL,1,1,1,0,0},
{"pexpire",pexpireCommand,3,"w",0,NULL,1,1,1,0,0},
{"pexpireat",pexpireatCommand,3,"w",0,NULL,1,1,1,0,0},
{"keys",keysCommand,2,"rS",0,NULL,0,0,0,0,0},
{"scan",scanCommand,-2,"rR",0,NULL,0,0,0,0,0},
{"dbsize",dbsizeCommand,1,"r",0,NULL,0,0,0,0,0},
{"auth",authCommand,2,"rslt",0,NULL,0,0,0,0,0},
{"ping",pingCommand,1,"rt",0,NULL,0,0,0,0,0},
{"echo",echoCommand,2,"r",0,NULL,0,0,0,0,0},
{"save",saveCommand,1,"ars",0,NULL,0,0,0,0,0},
{"bgsave",bgsaveCommand,1,"ar",0,NULL,0,0,0,0,0},
{"bgrewriteaof",bgrewriteaofCommand,1,"ar",0,NULL,0,0,0,0,0},
{"shutdown",shutdownCommand,-1,"arlt",0,NULL,0,0,0,0,0},
{"lastsave",lastsaveCommand,1,"rR",0,NULL,0,0,0,0,0},
{"type",typeCommand,2,"r",0,NULL,1,1,1,0,0},
{"multi",multiCommand,1,"rs",0,NULL,0,0,0,0,0},
{"exec",execCommand,1,"sM",0,NULL,0,0,0,0,0},
{"discard",discardCommand,1,"rs",0,NULL,0,0,0,0,0},
{"sync",syncCommand,1,"ars",0,NULL,0,0,0,0,0},
{"psync",syncCommand,3,"ars",0,NULL,0,0,0,0,0},
{"replconf",replconfCommand,-1,"arslt",0,NULL,0,0,0,0,0},
{"flushdb",flushdbCommand,1,"w",0,NULL,0,0,0,0,0},
{"flushall",flushallCommand,1,"w",0,NULL,0,0,0,0,0},
{"sort",sortCommand,-2,"wm",0,sortGetKeys,1,1,1,0,0},
{"info",infoCommand,-1,"rlt",0,NULL,0,0,0,0,0},
{"monitor",monitorCommand,1,"ars",0,NULL,0,0,0,0,0},
{"ttl",ttlCommand,2,"r",0,NULL,1,1,1,0,0},
{"pttl",pttlCommand,2,"r",0,NULL,1,1,1,0,0},
{"persist",persistCommand,2,"w",0,NULL,1,1,1,0,0},
{"slaveof",slaveofCommand,3,"ast",0,NULL,0,0,0,0,0},
{"debug",debugCommand,-2,"as",0,NULL,0,0,0,0,0},
{"config",configCommand,-2,"art",0,NULL,0,0,0,0,0},
{"subscribe",subscribeCommand,-2,"rpslt",0,NULL,0,0,0,0,0},
{"unsubscribe",unsubscribeCommand,-1,"rpslt",0,NULL,0,0,0,0,0},
{"psubscribe",psubscribeCommand,-2,"rpslt",0,NULL,0,0,0,0,0},
{"punsubscribe",punsubscribeCommand,-1,"rpslt",0,NULL,0,0,0,0,0},
{"publish",publishCommand,3,"pltr",0,NULL,0,0,0,0,0},
{"pubsub",pubsubCommand,-2,"pltrR",0,NULL,0,0,0,0,0},
{"watch",watchCommand,-2,"rs",0,NULL,1,-1,1,0,0},
{"unwatch",unwatchCommand,1,"rs",0,NULL,0,0,0,0,0},
{"cluster",clusterCommand,-2,"ar",0,NULL,0,0,0,0,0},
{"restore",restoreCommand,-4,"awm",0,NULL,1,1,1,0,0},
{"restore-asking",restoreCommand,-4,"awmk",0,NULL,1,1,1,0,0},
{"migrate",migrateCommand,-6,"aw",0,NULL,0,0,0,0,0},
{"asking",askingCommand,1,"r",0,NULL,0,0,0,0,0},
{"readonly",readonlyCommand,1,"r",0,NULL,0,0,0,0,0},
{"readwrite",readwriteCommand,1,"r",0,NULL,0,0,0,0,0},
{"dump",dumpCommand,2,"ar",0,NULL,1,1,1,0,0},
{"object",objectCommand,-2,"r",0,NULL,2,2,2,0,0},
{"client",clientCommand,-2,"ar",0,NULL,0,0,0,0,0},
{"eval",evalCommand,-3,"s",0,evalGetKeys,0,0,0,0,0},
{"evalsha",evalShaCommand,-3,"s",0,evalGetKeys,0,0,0,0,0},
{"slowlog",slowlogCommand,-2,"r",0,NULL,0,0,0,0,0},
{"script",scriptCommand,-2,"ras",0,NULL,0,0,0,0,0},
{"time",timeCommand,1,"rR",0,NULL,0,0,0,0,0},
{"bitop",bitopCommand,-4,"wm",0,NULL,2,-1,1,0,0},
{"bitcount",bitcountCommand,-2,"r",0,NULL,1,1,1,0,0},
{"bitpos",bitposCommand,-3,"r",0,NULL,1,1,1,0,0},
{"wait",waitCommand,3,"rs",0,NULL,0,0,0,0,0},
{"pfselftest",pfselftestCommand,1,"r",0,NULL,0,0,0,0,0},
{"pfadd",pfaddCommand,-2,"wm",0,NULL,1,1,1,0,0},
{"pfcount",pfcountCommand,-2,"w",0,NULL,1,1,1,0,0},
{"pfmerge",pfmergeCommand,-2,"wm",0,NULL,1,-1,1,0,0},
{"pfdebug",pfdebugCommand,-3,"w",0,NULL,0,0,0,0,0}
};
struct evictionPoolEntry *evictionPoolAlloc(void);
/*============================ Utility functions ============================ */
/* Low level logging. To use only for very big messages, otherwise
* redisLog() is to prefer. */
void redisLogRaw(int level, const char *msg) {
const int syslogLevelMap[] = { LOG_DEBUG, LOG_INFO, LOG_NOTICE, LOG_WARNING };
const char *c = ".-*#";
FILE *fp;
char buf[64];
int rawmode = (level & REDIS_LOG_RAW);
int log_to_stdout = server.logfile[0] == '\0';
level &= 0xff; /* clear flags */
if (level < server.verbosity) return;
fp = log_to_stdout ? stdout : fopen(server.logfile,"a");
if (!fp) return;
if (rawmode) {
fprintf(fp,"%s",msg);
} else {
int off;
struct timeval tv;
gettimeofday(&tv,NULL);
off = strftime(buf,sizeof(buf),"%d %b %H:%M:%S.",localtime(&tv.tv_sec));
snprintf(buf+off,sizeof(buf)-off,"%03d",(int)tv.tv_usec/1000);
fprintf(fp,"[%d] %s %c %s\n",(int)getpid(),buf,c[level],msg);
}
fflush(fp);
if (!log_to_stdout) fclose(fp);
if (server.syslog_enabled) syslog(syslogLevelMap[level], "%s", msg);
}
/* Like redisLogRaw() but with printf-alike support. This is the function that
* is used across the code. The raw version is only used in order to dump
* the INFO output on crash. */
void redisLog(int level, const char *fmt, ...) {
va_list ap;
char msg[REDIS_MAX_LOGMSG_LEN];
if ((level&0xff) < server.verbosity) return;
va_start(ap, fmt);
vsnprintf(msg, sizeof(msg), fmt, ap);
va_end(ap);
redisLogRaw(level,msg);
}
/* Log a fixed message without printf-alike capabilities, in a way that is
* safe to call from a signal handler.
*
* We actually use this only for signals that are not fatal from the point
* of view of Redis. Signals that are going to kill the server anyway and
* where we need printf-alike features are served by redisLog(). */
void redisLogFromHandler(int level, const char *msg) {
int fd;
int log_to_stdout = server.logfile[0] == '\0';
char buf[64];
if ((level&0xff) < server.verbosity || (log_to_stdout && server.daemonize))
return;
fd = log_to_stdout ? STDOUT_FILENO :
open(server.logfile, O_APPEND|O_CREAT|O_WRONLY, 0644);
if (fd == -1) return;
ll2string(buf,sizeof(buf),getpid());
if (write(fd,"[",1) == -1) goto err;
if (write(fd,buf,strlen(buf)) == -1) goto err;
if (write(fd," | signal handler] (",20) == -1) goto err;
ll2string(buf,sizeof(buf),time(NULL));
if (write(fd,buf,strlen(buf)) == -1) goto err;
if (write(fd,") ",2) == -1) goto err;
if (write(fd,msg,strlen(msg)) == -1) goto err;
if (write(fd,"\n",1) == -1) goto err;
err:
if (!log_to_stdout) close(fd);
}
/* Return the UNIX time in microseconds */
// 返回微秒格式的 UNIX 时间
// 1 秒 = 1 000 000 微秒
long long ustime(void) {
struct timeval tv;
long long ust;
gettimeofday(&tv, NULL);
ust = ((long long)tv.tv_sec)*1000000;
ust += tv.tv_usec;
return ust;
}
/* Return the UNIX time in milliseconds */
// 返回毫秒格式的 UNIX 时间
// 1 秒 = 1 000 毫秒
long long mstime(void) {
return ustime()/1000;
}
/* After an RDB dump or AOF rewrite we exit from children using _exit() instead of
* exit(), because the latter may interact with the same file objects used by
* the parent process. However if we are testing the coverage normal exit() is
* used in order to obtain the right coverage information. */
void exitFromChild(int retcode) {
#ifdef COVERAGE_TEST
exit(retcode);
#else
_exit(retcode);
#endif
}
/*====================== Hash table type implementation ==================== */
/* This is a hash table type that uses the SDS dynamic strings library as
* keys and radis objects as values (objects can hold SDS strings,
* lists, sets). */
void dictVanillaFree(void *privdata, void *val)
{
DICT_NOTUSED(privdata);
zfree(val);
}
void dictListDestructor(void *privdata, void *val)
{
DICT_NOTUSED(privdata);
listRelease((list*)val);
}
int dictSdsKeyCompare(void *privdata, const void *key1,
const void *key2)
{
int l1,l2;
DICT_NOTUSED(privdata);
l1 = sdslen((sds)key1);
l2 = sdslen((sds)key2);
if (l1 != l2) return 0;
return memcmp(key1, key2, l1) == 0;
}
/* A case insensitive version used for the command lookup table and other
* places where case insensitive non binary-safe comparison is needed. */
int dictSdsKeyCaseCompare(void *privdata, const void *key1,
const void *key2)
{
DICT_NOTUSED(privdata);
return strcasecmp(key1, key2) == 0;
}
void dictRedisObjectDestructor(void *privdata, void *val)
{
DICT_NOTUSED(privdata);
if (val == NULL) return; /* Values of swapped out keys as set to NULL */
decrRefCount(val);
}
void dictSdsDestructor(void *privdata, void *val)
{
DICT_NOTUSED(privdata);
sdsfree(val);
}
int dictObjKeyCompare(void *privdata, const void *key1,
const void *key2)
{
const robj *o1 = key1, *o2 = key2;
return dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
}
unsigned int dictObjHash(const void *key) {
const robj *o = key;
return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
}
unsigned int dictSdsHash(const void *key) {
return dictGenHashFunction((unsigned char*)key, sdslen((char*)key));
}
unsigned int dictSdsCaseHash(const void *key) {
return dictGenCaseHashFunction((unsigned char*)key, sdslen((char*)key));
}
int dictEncObjKeyCompare(void *privdata, const void *key1,
const void *key2)
{
robj *o1 = (robj*) key1, *o2 = (robj*) key2;
int cmp;
if (o1->encoding == REDIS_ENCODING_INT &&
o2->encoding == REDIS_ENCODING_INT)
return o1->ptr == o2->ptr;
o1 = getDecodedObject(o1);
o2 = getDecodedObject(o2);
cmp = dictSdsKeyCompare(privdata,o1->ptr,o2->ptr);
decrRefCount(o1);
decrRefCount(o2);
return cmp;
}
unsigned int dictEncObjHash(const void *key) {
robj *o = (robj*) key;
if (sdsEncodedObject(o)) {
return dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
} else {
if (o->encoding == REDIS_ENCODING_INT) {
char buf[32];
int len;
len = ll2string(buf,32,(long)o->ptr);
return dictGenHashFunction((unsigned char*)buf, len);
} else {
unsigned int hash;
o = getDecodedObject(o);
hash = dictGenHashFunction(o->ptr, sdslen((sds)o->ptr));
decrRefCount(o);
return hash;
}
}
}
/* Sets type hash table */
dictType setDictType = {
dictEncObjHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictEncObjKeyCompare, /* key compare */
dictRedisObjectDestructor, /* key destructor */
NULL /* val destructor */
};
/* Sorted sets hash (note: a skiplist is used in addition to the hash table) */
dictType zsetDictType = {
dictEncObjHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictEncObjKeyCompare, /* key compare */
dictRedisObjectDestructor, /* key destructor */
NULL /* val destructor */
};
/* Db->dict, keys are sds strings, vals are Redis objects. */
dictType dbDictType = {
dictSdsHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictSdsKeyCompare, /* key compare */
dictSdsDestructor, /* key destructor */
dictRedisObjectDestructor /* val destructor */
};
/* server.lua_scripts sha (as sds string) -> scripts (as robj) cache. */
dictType shaScriptObjectDictType = {
dictSdsCaseHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictSdsKeyCaseCompare, /* key compare */
dictSdsDestructor, /* key destructor */
dictRedisObjectDestructor /* val destructor */
};
/* Db->expires */
dictType keyptrDictType = {
dictSdsHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictSdsKeyCompare, /* key compare */
NULL, /* key destructor */
NULL /* val destructor */
};
/* Command table. sds string -> command struct pointer. */
dictType commandTableDictType = {
dictSdsCaseHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictSdsKeyCaseCompare, /* key compare */
dictSdsDestructor, /* key destructor */
NULL /* val destructor */
};
/* Hash type hash table (note that small hashes are represented with ziplists) */
dictType hashDictType = {
dictEncObjHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictEncObjKeyCompare, /* key compare */
dictRedisObjectDestructor, /* key destructor */
dictRedisObjectDestructor /* val destructor */
};
/* Keylist hash table type has unencoded redis objects as keys and
* lists as values. It's used for blocking operations (BLPOP) and to
* map swapped keys to a list of clients waiting for this keys to be loaded. */
dictType keylistDictType = {
dictObjHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictObjKeyCompare, /* key compare */
dictRedisObjectDestructor, /* key destructor */
dictListDestructor /* val destructor */
};
/* Cluster nodes hash table, mapping nodes addresses 1.2.3.4:6379 to
* clusterNode structures. */
dictType clusterNodesDictType = {
dictSdsHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictSdsKeyCompare, /* key compare */
dictSdsDestructor, /* key destructor */
NULL /* val destructor */
};
/* Cluster re-addition blacklist. This maps node IDs to the time
* we can re-add this node. The goal is to avoid readding a removed
* node for some time. */
dictType clusterNodesBlackListDictType = {
dictSdsCaseHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictSdsKeyCaseCompare, /* key compare */
dictSdsDestructor, /* key destructor */
NULL /* val destructor */
};
/* Migrate cache dict type. */
dictType migrateCacheDictType = {
dictSdsHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictSdsKeyCompare, /* key compare */
dictSdsDestructor, /* key destructor */
NULL /* val destructor */
};
/* Replication cached script dict (server.repl_scriptcache_dict).
* Keys are sds SHA1 strings, while values are not used at all in the current
* implementation. */
dictType replScriptCacheDictType = {
dictSdsCaseHash, /* hash function */
NULL, /* key dup */
NULL, /* val dup */
dictSdsKeyCaseCompare, /* key compare */
dictSdsDestructor, /* key destructor */
NULL /* val destructor */
};
int htNeedsResize(dict *dict) {
long long size, used;
size = dictSlots(dict);
used = dictSize(dict);
return (size && used && size > DICT_HT_INITIAL_SIZE &&
(used*100/size < REDIS_HT_MINFILL));
}
/* If the percentage of used slots in the HT reaches REDIS_HT_MINFILL
* we resize the hash table to save memory */
// 如果字典的使用率比 REDIS_HT_MINFILL 常量要低
// 那么通过缩小字典的体积来节约内存
void tryResizeHashTables(int dbid) {
if (htNeedsResize(server.db[dbid].dict))
dictResize(server.db[dbid].dict);
if (htNeedsResize(server.db[dbid].expires))
dictResize(server.db[dbid].expires);
}
/* Our hash table implementation performs rehashing incrementally while
* we write/read from the hash table. Still if the server is idle, the hash
* table will use two tables for a long time. So we try to use 1 millisecond
* of CPU time at every call of this function to perform some rehahsing.
*
* 虽然服务器在对数据库执行读取/写入命令时会对数据库进行渐进式 rehash ,
* 但如果服务器长期没有执行命令的话,数据库字典的 rehash 就可能一直没办法完成,
* 为了防止出现这种情况,我们需要对数据库执行主动 rehash 。
*
* The function returns 1 if some rehashing was performed, otherwise 0
* is returned.
*
* 函数在执行了主动 rehash 时返回 1 ,否则返回 0 。
*/
int incrementallyRehash(int dbid) {
/* Keys dictionary */
if (dictIsRehashing(server.db[dbid].dict)) {
dictRehashMilliseconds(server.db[dbid].dict,1);
return 1; /* already used our millisecond for this loop... */
}
/* Expires */
if (dictIsRehashing(server.db[dbid].expires)) {
dictRehashMilliseconds(server.db[dbid].expires,1);
return 1; /* already used our millisecond for this loop... */
}
return 0;
}
/* This function is called once a background process of some kind terminates,
* as we want to avoid resizing the hash tables when there is a child in order
* to play well with copy-on-write (otherwise when a resize happens lots of
* memory pages are copied). The goal of this function is to update the ability
* for dict.c to resize the hash tables accordingly to the fact we have o not
* running childs. */
void updateDictResizePolicy(void) {
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1)
dictEnableResize();
else
dictDisableResize();
}
/* ======================= Cron: called every 100 ms ======================== */
/* Helper function for the activeExpireCycle() function.
* This function will try to expire the key that is stored in the hash table
* entry 'de' of the 'expires' hash table of a Redis database.
*
* activeExpireCycle() 函数使用的检查键是否过期的辅佐函数。
*
* If the key is found to be expired, it is removed from the database and
* 1 is returned. Otherwise no operation is performed and 0 is returned.
*
* 如果 de 中的键已经过期,那么移除它,并返回 1 ,否则不做动作,并返回 0 。
*
* When a key is expired, server.stat_expiredkeys is incremented.
*
* The parameter 'now' is the current time in milliseconds as is passed
* to the function to avoid too many gettimeofday() syscalls.
*
* 参数 now 是毫秒格式的当前时间
*/
int activeExpireCycleTryExpire(redisDb *db, dictEntry *de, long long now) {
// 获取键的过期时间
long long t = dictGetSignedIntegerVal(de);
if (now > t) {
// 键已过期
sds key = dictGetKey(de);
robj *keyobj = createStringObject(key,sdslen(key));
// 传播过期命令
propagateExpire(db,keyobj);
// 从数据库中删除该键
dbDelete(db,keyobj);
// 发送事件
notifyKeyspaceEvent(REDIS_NOTIFY_EXPIRED,
"expired",keyobj,db->id);
decrRefCount(keyobj);
// 更新计数器
server.stat_expiredkeys++;
return 1;
} else {
// 键未过期
return 0;
}
}
/* Try to expire a few timed out keys. The algorithm used is adaptive and
* will use few CPU cycles if there are few expiring keys, otherwise
* it will get more aggressive to avoid that too much memory is used by
* keys that can be removed from the keyspace.
*
* 函数尝试删除数据库中已经过期的键。
* 当带有过期时间的键比较少时,函数运行得比较保守,
* 如果带有过期时间的键比较多,那么函数会以更积极的方式来删除过期键,
* 从而可能地释放被过期键占用的内存。
*
* No more than REDIS_DBCRON_DBS_PER_CALL databases are tested at every
* iteration.
*
* 每次循环中被测试的数据库数目不会超过 REDIS_DBCRON_DBS_PER_CALL 。
*
* This kind of call is used when Redis detects that timelimit_exit is
* true, so there is more work to do, and we do it more incrementally from
* the beforeSleep() function of the event loop.
*
* 如果 timelimit_exit 为真,那么说明还有更多删除工作要做,
* 那么在 beforeSleep() 函数调用时,程序会再次执行这个函数。
*
* Expire cycle type:
*
* 过期循环的类型:
*
* If type is ACTIVE_EXPIRE_CYCLE_FAST the function will try to run a
* "fast" expire cycle that takes no longer than EXPIRE_FAST_CYCLE_DURATION
* microseconds, and is not repeated again before the same amount of time.
*
* 如果循环的类型为 ACTIVE_EXPIRE_CYCLE_FAST ,
* 那么函数会以“快速过期”模式执行,
* 执行的时间不会长过 EXPIRE_FAST_CYCLE_DURATION 毫秒,
* 并且在 EXPIRE_FAST_CYCLE_DURATION 毫秒之内不会再重新执行。
*
* If type is ACTIVE_EXPIRE_CYCLE_SLOW, that normal expire cycle is
* executed, where the time limit is a percentage of the REDIS_HZ period
* as specified by the REDIS_EXPIRELOOKUPS_TIME_PERC define.
*
* 如果循环的类型为 ACTIVE_EXPIRE_CYCLE_SLOW ,
* 那么函数会以“正常过期”模式执行,
* 函数的执行时限为 REDIS_HS 常量的一个百分比,
* 这个百分比由 REDIS_EXPIRELOOKUPS_TIME_PERC 定义。
*/
void activeExpireCycle(int type) {
/* This function has some global state in order to continue the work
* incrementally across calls. */
// 静态变量,用来累积函数连续执行时的数据
static unsigned int current_db = 0; /* Last DB tested. */
static int timelimit_exit = 0; /* Time limit hit in previous call? */
static long long last_fast_cycle = 0; /* When last fast cycle ran. */
unsigned int j, iteration = 0;
// 默认每次处理的数据库数量
unsigned int dbs_per_call = REDIS_DBCRON_DBS_PER_CALL;
// 函数开始的时间
long long start = ustime(), timelimit;
// 快速模式
if (type == ACTIVE_EXPIRE_CYCLE_FAST) {
/* Don't start a fast cycle if the previous cycle did not exited
* for time limt. Also don't repeat a fast cycle for the same period
* as the fast cycle total duration itself. */
// 如果上次函数没有触发 timelimit_exit ,那么不执行处理
if (!timelimit_exit) return;
// 如果距离上次执行未够一定时间,那么不执行处理
if (start < last_fast_cycle + ACTIVE_EXPIRE_CYCLE_FAST_DURATION*2) return;
// 运行到这里,说明执行快速处理,记录当前时间
last_fast_cycle = start;
}
/* We usually should test REDIS_DBCRON_DBS_PER_CALL per iteration, with
* two exceptions:
*
* 一般情况下,函数只处理 REDIS_DBCRON_DBS_PER_CALL 个数据库,
* 除非:
*
* 1) Don't test more DBs than we have.
* 当前数据库的数量小于 REDIS_DBCRON_DBS_PER_CALL
* 2) If last time we hit the time limit, we want to scan all DBs
* in this iteration, as there is work to do in some DB and we don't want
* expired keys to use memory for too much time.
* 如果上次处理遇到了时间上限,那么这次需要对所有数据库进行扫描,
* 这可以避免过多的过期键占用空间
*/
if (dbs_per_call > server.dbnum || timelimit_exit)
dbs_per_call = server.dbnum;
/* We can use at max ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC percentage of CPU time
* per iteration. Since this function gets called with a frequency of
* server.hz times per second, the following is the max amount of
* microseconds we can spend in this function. */
// 函数处理的微秒时间上限
// ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC 默认为 25 ,也即是 25 % 的 CPU 时间
timelimit = 1000000*ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC/server.hz/100;
timelimit_exit = 0;
if (timelimit <= 0) timelimit = 1;
// 如果是运行在快速模式之下
// 那么最多只能运行 FAST_DURATION 微秒
// 默认值为 1000 (微秒)
if (type == ACTIVE_EXPIRE_CYCLE_FAST)
timelimit = ACTIVE_EXPIRE_CYCLE_FAST_DURATION; /* in microseconds. */
// 遍历数据库
for (j = 0; j < dbs_per_call; j++) {
int expired;
// 指向要处理的数据库
redisDb *db = server.db+(current_db % server.dbnum);
/* Increment the DB now so we are sure if we run out of time
* in the current DB we'll restart from the next. This allows to
* distribute the time evenly across DBs. */
// 为 DB 计数器加一,如果进入 do 循环之后因为超时而跳出
// 那么下次会直接从下个 DB 开始处理
current_db++;
/* Continue to expire if at the end of the cycle more than 25%
* of the keys were expired. */
do {
unsigned long num, slots;
long long now, ttl_sum;
int ttl_samples;
/* If there is nothing to expire try next DB ASAP. */
// 获取数据库中带过期时间的键的数量
// 如果该数量为 0 ,直接跳过这个数据库
if ((num = dictSize(db->expires)) == 0) {
db->avg_ttl = 0;
break;
}
// 获取数据库中键值对的数量
slots = dictSlots(db->expires);
// 当前时间
now = mstime();
/* When there are less than 1% filled slots getting random
* keys is expensive, so stop here waiting for better times...
* The dictionary will be resized asap. */
// 这个数据库的使用率低于 1% ,扫描起来太费力了(大部分都会 MISS)
// 跳过,等待字典收缩程序运行
if (num && slots > DICT_HT_INITIAL_SIZE &&
(num*100/slots < 1)) break;
/* The main collection cycle. Sample random keys among keys
* with an expire set, checking for expired ones.
*
* 样本计数器
*/
// 已处理过期键计数器
expired = 0;
// 键的总 TTL 计数器
ttl_sum = 0;
// 总共处理的键计数器
ttl_samples = 0;
// 每次最多只能检查 LOOKUPS_PER_LOOP 个键
if (num > ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP)
num = ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP;
// 开始遍历数据库
while (num--) {
dictEntry *de;
long long ttl;
// 从 expires 中随机取出一个带过期时间的键
if ((de = dictGetRandomKey(db->expires)) == NULL) break;
// 计算 TTL
ttl = dictGetSignedIntegerVal(de)-now;
// 如果键已经过期,那么删除它,并将 expired 计数器增一
if (activeExpireCycleTryExpire(db,de,now)) expired++;
if (ttl < 0) ttl = 0;
// 累积键的 TTL
ttl_sum += ttl;
// 累积处理键的个数
ttl_samples++;
}
/* Update the average TTL stats for this database. */
// 为这个数据库更新平均 TTL 统计数据
if (ttl_samples) {
// 计算当前平均值
long long avg_ttl = ttl_sum/ttl_samples;
// 如果这是第一次设置数据库平均 TTL ,那么进行初始化
if (db->avg_ttl == 0) db->avg_ttl = avg_ttl;
/* Smooth the value averaging with the previous one. */
// 取数据库的上次平均 TTL 和今次平均 TTL 的平均值