forked from BrianGladman/mpir
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspeed.h
3365 lines (3143 loc) · 108 KB
/
speed.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* Header for speed and threshold things.
Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010, 2011,
2012 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.
The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
*/
#ifndef __SPEED_H__
#define __SPEED_H__
/* Pad ptr,oldsize with zero limbs (at the most significant end) to make it
newsize long.
The macro hands (ptr)+(oldsize) and the count (newsize)-(oldsize) to
MPN_ZERO; nothing is allocated here, so the storage for newsize limbs must
already exist.  newsize >= oldsize is checked with ASSERT.
oldsize and newsize each appear twice in the expansion, so pass expressions
without side effects. */
#define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \
do { \
ASSERT ((newsize) >= (oldsize)); \
MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \
} while (0)
/* Mask with the n least significant bits set, for 0 <= n <= BITS_PER_MP_LIMB.
   A shift by the full limb width cannot be relied on (1<<32 doesn't give
   zero on x86 family CPUs), so only counts below BITS_PER_MP_LIMB go through
   the shift; the full-width case is answered with MP_LIMB_T_MAX.
   n appears twice in the expansion, so it must have no side effects. */
#define MP_LIMB_T_LOWBITMASK(n) \
  ((n) != BITS_PER_MP_LIMB ? ((mp_limb_t) 1 << (n)) - 1 : MP_LIMB_T_MAX)
/* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
/* TMP_ALLOC_ALIGNED asks TMP_ALLOC for (bytes)+(align)-1 bytes and passes the
result through align_pointer, the extra align-1 bytes being the slack that
lets the pointer be moved to an aligned address.  align is expanded twice.
NOTE(review): as a TMP_ALLOC wrapper this presumably needs TMP_DECL and
TMP_MARK in the calling scope -- confirm against gmp-impl.h. */
#define TMP_ALLOC_ALIGNED(bytes, align) \
align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
/* Same, with the size given in limbs and the result cast to mp_ptr. */
#define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
/* CACHE_LINE_SIZE is our default alignment for speed operands, and the
limit on what s->align_xp etc can then request for off-alignment. Maybe
this should be an option of some sort, but in any case here are some line
sizes,
bytes
32 pentium
64 athlon
64 itanium-2 L1
128 itanium-2 L2
*/
#define CACHE_LINE_SIZE 64 /* bytes */
/* Limbs per cache line, minus one.  SPEED_TMP_ALLOC_LIMBS uses it both as
the number of spare limbs to allocate and as a bit mask on limb offsets,
which only works when the limbs-per-line count is a power of 2. */
#define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)
/* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
   alignment.

   "align" is an offset counted in limbs within a cache line: on return the
   limb index of ptr (its address divided by the limb size) is congruent to
   align modulo the number of limbs in a cache line.  To make room for the
   adjustment, SPEED_TMP_ALLOC_ADJUST_MASK extra limbs are allocated and ptr
   is advanced by 0 to SPEED_TMP_ALLOC_ADJUST_MASK limbs.

   The limb index is obtained by converting the pointer to an integer.
   Subtracting a null pointer, as was done previously, is undefined
   behaviour in C and is diagnosed by current compilers.

   Needs TMP_DECL/TMP_MARK in the calling scope, like TMP_ALLOC_LIMBS.  ptr
   must be an lvalue; limbs and align are each evaluated once.  */
#define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)                        \
  do {                                                                  \
    mp_ptr     __ptr;                                                   \
    mp_size_t  __ptr_align, __ptr_add;                                  \
                                                                        \
    ASSERT ((CACHE_LINE_SIZE % BYTES_PER_MP_LIMB) == 0);                \
    __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);    \
    /* address of the block, counted in limbs */                        \
    __ptr_align = (mp_size_t) ((size_t) __ptr / sizeof (mp_limb_t));    \
    /* limbs to skip so the low bits of the limb index equal align */   \
    __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;  \
    (ptr) = __ptr + __ptr_add;                                          \
  } while (0)
/* This is the size for s->xp_block and s->yp_block, used in certain
routines that want to run across many different data values and use
s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
With 4-byte limbs 512 means 2kbytes of data for each of xp_block and
yp_block, making 4k total; with 8-byte limbs it is 4kbytes each, 8k total.
Either should fit easily in any L1 data cache. */
#define SPEED_BLOCK_SIZE 512 /* limbs */
/* Timer interface.  With MSVC nothing is declared here and win_timing.h is
included instead.  NOTE(review): presumably that header declares the same
names -- confirm. */
#ifdef _MSC_VER
#include "win_timing.h"
#else
extern double speed_unittime;
extern double speed_cycletime;
extern int speed_precision;
extern char speed_time_string[];
void speed_time_init(void);
void speed_cycletime_fail(const char *str);
void speed_cycletime_init(void);
void speed_cycletime_need_cycles(void);
void speed_cycletime_need_seconds(void);
/* speed_starttime and speed_endtime bracket the timed loop in every
SPEED_ROUTINE_* macro; the double from speed_endtime is what those
routines return. */
void speed_starttime(void);
double speed_endtime(void);
#endif
/* Parameters handed to a measuring routine (see speed_function_t).  The
SPEED_ROUTINE_* macros read these through a pointer named "s". */
struct speed_params {
unsigned reps; /* how many times to run the routine */
mp_ptr xp; /* first argument */
mp_ptr yp; /* second argument */
mp_size_t size; /* size of both arguments */
mp_limb_t r; /* user supplied parameter; some routines use it to select an
operand overlap mode, see SPEED_ROUTINE_MPN_BINARY_N_CALL */
mp_size_t align_xp; /* alignment of xp */
mp_size_t align_yp; /* alignment of yp */
mp_size_t align_wp; /* intended alignment of wp */
mp_size_t align_wp2; /* intended alignment of wp2 */
mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */
mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */
double time_divisor; /* optionally set by the speed routine */
/* used by the cache priming things */
int cache;
/* NOTE(review): src/dst look like the operand lists built by
speed_operand_src and speed_operand_dst, with src_num/dst_num counting the
entries in use (at most 5 each) -- confirm against their definitions. */
unsigned src_num, dst_num;
struct {
mp_ptr ptr;
mp_size_t size;
} src[5], dst[5];
};
/* Type of a measuring routine.  The routines built from the SPEED_ROUTINE_*
macros in this file return the value of speed_endtime(), or -1.0 when the
parameters in *s are rejected (see SPEED_RESTRICT_COND). */
typedef double (*speed_function_t)(struct speed_params *s);
/* Run measuring routine fun with parameters s and return a double result.
NOTE(review): the exact meaning of the result is set by the definition,
which is not in this header -- confirm there. */
double speed_measure(speed_function_t fun, struct speed_params *s);
/* Prototypes for speed measuring routines.

   Every routine here has the speed_function_t signature: it takes the
   parameters in *s and returns a double.  Each name is declared exactly
   once, and all parameters are named "s" for consistency.  */
double speed_back_to_back(struct speed_params *s);
double speed_count_leading_zeros(struct speed_params *s);
double speed_count_trailing_zeros(struct speed_params *s);
double speed_gmp_allocate_free(struct speed_params *s);
double speed_gmp_allocate_reallocate_free(struct speed_params *s);
double speed_invert_limb(struct speed_params *s);
double speed_malloc_free(struct speed_params *s);
double speed_malloc_realloc_free(struct speed_params *s);
double speed_memcpy(struct speed_params *s);
double speed_modlimb_invert(struct speed_params *s);
double speed_modlimb_invert_mul1(struct speed_params *s);
double speed_modlimb_invert_loop(struct speed_params *s);
double speed_modlimb_invert_cond(struct speed_params *s);
double speed_modlimb_invert_arith(struct speed_params *s);
double speed_mpf_init_clear(struct speed_params *s);
double speed_mpn_add_err1_n(struct speed_params *s);
double speed_mpn_sub_err1_n(struct speed_params *s);
double speed_mpn_add_n(struct speed_params *s);
double speed_mpn_addadd_n(struct speed_params *s);
double speed_mpn_subadd_n(struct speed_params *s);
double speed_mpn_addlsh1_n(struct speed_params *s);
double speed_mpn_addlsh_n(struct speed_params *s);
double speed_mpn_sublsh_n(struct speed_params *s);
double speed_mpn_declsh_n(struct speed_params *s);
double speed_mpn_inclsh_n(struct speed_params *s);
double speed_mpn_sumdiff_n(struct speed_params *s);
double speed_mpn_nsumdiff_n(struct speed_params *s);
double speed_mpn_and_n(struct speed_params *s);
double speed_mpn_andn_n(struct speed_params *s);
double speed_mpn_addmul_1(struct speed_params *s);
double speed_mpn_addmul_2(struct speed_params *s);
double speed_mpn_addmul_3(struct speed_params *s);
double speed_mpn_addmul_4(struct speed_params *s);
double speed_mpn_addmul_5(struct speed_params *s);
double speed_mpn_addmul_6(struct speed_params *s);
double speed_mpn_addmul_7(struct speed_params *s);
double speed_mpn_addmul_8(struct speed_params *s);
double speed_mpn_addsub_n(struct speed_params *s);
double speed_mpn_com_n(struct speed_params *s);
double speed_mpn_not(struct speed_params *s);
double speed_mpn_copyd(struct speed_params *s);
double speed_mpn_copyi(struct speed_params *s);
double speed_mpn_dc_tdiv_qr(struct speed_params *s);
double speed_mpn_dc_div_qr_n(struct speed_params *s);
double speed_mpn_tdiv_q(struct speed_params *s);
double speed_mpn_tdiv_q1(struct speed_params *s);
double speed_mpn_sb_divappr_q(struct speed_params *s);
double speed_mpn_sb_div_qr(struct speed_params *s);
double speed_mpn_dc_divappr_q(struct speed_params *s);
double speed_mpn_dc_bdiv_q(struct speed_params *s);
double speed_mpn_dc_bdiv_qr_n(struct speed_params *s);
double speed_mpn_inv_div_qr(struct speed_params *s);
double speed_mpn_inv_divappr_q(struct speed_params *s);
double speed_mpn_tdiv_q2(struct speed_params *s);
double speed_MPN_COPY(struct speed_params *s);
double speed_MPN_COPY_DECR(struct speed_params *s);
double speed_MPN_COPY_INCR(struct speed_params *s);
double speed_mpn_divexact_1(struct speed_params *s);
double speed_mpn_divexact_by3(struct speed_params *s);
double speed_mpn_divexact_byff(struct speed_params *s);
double speed_mpn_divexact_byfobm1(struct speed_params *s);
double speed_mpn_divrem_euclidean_qr_1(struct speed_params *s);
double speed_mpn_divrem_euclidean_qr_2(struct speed_params *s);
double speed_mpn_divrem_euclidean_r_1(struct speed_params *s);
double speed_mpn_divrem_hensel_qr_1(struct speed_params *s);
double speed_mpn_divrem_hensel_qr_1_1(struct speed_params *s);
double speed_mpn_divrem_hensel_qr_1_2(struct speed_params *s);
double speed_mpn_divrem_hensel_r_1(struct speed_params *s);
double speed_mpn_rsh_divrem_hensel_qr_1(struct speed_params *s);
double speed_mpn_rsh_divrem_hensel_qr_1_1(struct speed_params *s);
double speed_mpn_rsh_divrem_hensel_qr_1_2(struct speed_params *s);
double speed_mpn_divrem_hensel_rsh_qr_1(struct speed_params *s);
double speed_mpn_divrem_1(struct speed_params *s);
double speed_mpn_divrem_1f(struct speed_params *s);
double speed_mpn_divrem_1c(struct speed_params *s);
double speed_mpn_divrem_1cf(struct speed_params *s);
double speed_mpn_divrem_1_div(struct speed_params *s);
double speed_mpn_divrem_1f_div(struct speed_params *s);
double speed_mpn_divrem_1_inv(struct speed_params *s);
double speed_mpn_divrem_1f_inv(struct speed_params *s);
double speed_mpn_divrem_2(struct speed_params *s);
double speed_mpn_divrem_2_div(struct speed_params *s);
double speed_mpn_divrem_2_inv(struct speed_params *s);
double speed_mpn_double(struct speed_params *s);
double speed_mpn_half(struct speed_params *s);
double speed_mpn_fib2_ui(struct speed_params *s);
double speed_mpn_matrix22_mul(struct speed_params *s);
double speed_mpn_hgcd(struct speed_params *s);
double speed_mpn_hgcd_lehmer(struct speed_params *s);
double speed_mpn_hgcd_appr(struct speed_params *s);
double speed_mpn_hgcd_appr_lehmer(struct speed_params *s);
double speed_mpn_hgcd_reduce(struct speed_params *s);
double speed_mpn_hgcd_reduce_1(struct speed_params *s);
double speed_mpn_hgcd_reduce_2(struct speed_params *s);
double speed_mpn_gcd(struct speed_params *s);
double speed_mpn_gcd_1(struct speed_params *s);
double speed_mpn_gcd_1N(struct speed_params *s);
double speed_mpn_gcdext(struct speed_params *s);
double speed_mpn_gcdext_double(struct speed_params *s);
double speed_mpn_gcdext_one_double(struct speed_params *s);
double speed_mpn_gcdext_one_single(struct speed_params *s);
double speed_mpn_gcdext_single(struct speed_params *s);
double speed_mpn_get_str(struct speed_params *s);
double speed_mpn_hamdist(struct speed_params *s);
double speed_mpn_ior_n(struct speed_params *s);
double speed_mpn_iorn_n(struct speed_params *s);
double speed_mpn_jacobi_base(struct speed_params *s);
double speed_mpn_jacobi_base_1(struct speed_params *s);
double speed_mpn_jacobi_base_2(struct speed_params *s);
double speed_mpn_jacobi_base_3(struct speed_params *s);
double speed_mpn_jacobi_base_4(struct speed_params *s);
double speed_mpn_kara_mul_n(struct speed_params *s);
double speed_mpn_kara_sqr_n(struct speed_params *s);
double speed_mpn_karaadd(struct speed_params *s);
double speed_mpn_karasub(struct speed_params *s);
double speed_mpn_lshift(struct speed_params *s);
double speed_mpn_lshift1(struct speed_params *s);
double speed_mpn_lshift2(struct speed_params *s);
double speed_mpn_lshiftc(struct speed_params *s);
double speed_mpn_mod_1(struct speed_params *s);
double speed_mpn_mod_1_1(struct speed_params *s);
double speed_mpn_mod_1_2(struct speed_params *s);
double speed_mpn_mod_1_3(struct speed_params *s);
double speed_mpn_mod_1_k(struct speed_params *s);
double speed_mpn_mod_1c(struct speed_params *s);
double speed_mpn_mod_1_div(struct speed_params *s);
double speed_mpn_mod_1_inv(struct speed_params *s);
double speed_mpn_mod_34lsub1(struct speed_params *s);
double speed_mpn_modexact_1_odd(struct speed_params *s);
double speed_mpn_modexact_1c_odd(struct speed_params *s);
double speed_mpn_mul_1(struct speed_params *s);
double speed_mpn_mul_1_inplace(struct speed_params *s);
double speed_mpn_mul_2(struct speed_params *s);
double speed_mpn_mul_basecase(struct speed_params *s);
double speed_mpn_mul_fft_main(struct speed_params *s);
double speed_mpn_sqr_fft_main(struct speed_params *s);
double speed_mpn_mul_n(struct speed_params *s);
double speed_mpn_mul_n_sqr(struct speed_params *s);
double speed_mpn_mullow_n(struct speed_params *s);
double speed_mpn_mulhigh_n(struct speed_params *s);
double speed_mpn_mulmod_2expm1(struct speed_params *s);
double speed_mpn_mulmod_2expp1_basecase(struct speed_params *s);
double speed_mpn_mullow_n_basecase(struct speed_params *s);
double speed_mpn_mulmid(struct speed_params *s);
double speed_mpn_mulmid_basecase(struct speed_params *s);
double speed_mpn_mulmid_n(struct speed_params *s);
double speed_mpn_toom42_mulmid(struct speed_params *s);
double speed_mpn_nand_n(struct speed_params *s);
double speed_mpn_nior_n(struct speed_params *s);
double speed_mpn_popcount(struct speed_params *s);
double speed_mpn_preinv_divrem_1(struct speed_params *s);
double speed_mpn_preinv_divrem_1f(struct speed_params *s);
double speed_mpn_preinv_mod_1(struct speed_params *s);
double speed_mpn_binvert(struct speed_params *s);
double speed_mpn_redc_1(struct speed_params *s);
double speed_mpn_redc_2(struct speed_params *s);
double speed_mpn_redc_n(struct speed_params *s);
double speed_mpn_rsh1add_n(struct speed_params *s);
double speed_mpn_rsh1sub_n(struct speed_params *s);
double speed_mpn_rshift(struct speed_params *s);
double speed_mpn_rshift1(struct speed_params *s);
double speed_mpn_rshift2(struct speed_params *s);
double speed_mpn_set_str(struct speed_params *s);
double speed_mpn_set_str_basecase(struct speed_params *s);
double speed_mpn_set_str_subquad(struct speed_params *s);
double speed_mpn_sqr_basecase(struct speed_params *s);
double speed_mpn_sqr_diagonal(struct speed_params *s);
double speed_mpn_sqr(struct speed_params *s);
double speed_mpn_sqrtrem(struct speed_params *s);
double speed_mpn_rootrem(struct speed_params *s);
double speed_mpn_sub_n(struct speed_params *s);
double speed_mpn_sublsh1_n(struct speed_params *s);
double speed_mpn_submul_1(struct speed_params *s);
double speed_mpn_toom3_mul_n(struct speed_params *s);
double speed_mpn_toom4_mul_n(struct speed_params *s);
double speed_mpn_toom8h_mul(struct speed_params *s);
double speed_mpn_toom3_sqr_n(struct speed_params *s);
double speed_mpn_toom4_sqr_n(struct speed_params *s);
double speed_mpn_toom8_sqr_n(struct speed_params *s);
double speed_mpn_udiv_qrnnd(struct speed_params *s);
double speed_mpn_udiv_qrnnd_r(struct speed_params *s);
double speed_mpn_umul_ppmm(struct speed_params *s);
double speed_mpn_umul_ppmm_r(struct speed_params *s);
double speed_mpn_xnor_n(struct speed_params *s);
double speed_mpn_xor_n(struct speed_params *s);
double speed_MPN_ZERO(struct speed_params *s);
double speed_mpn_store(struct speed_params *s);
double speed_mpq_init_clear(struct speed_params *s);
double speed_mpz_add(struct speed_params *s);
double speed_mpz_bin_uiui(struct speed_params *s);
double speed_mpz_fac_ui(struct speed_params *s);
double speed_mpz_fac_ui_small(struct speed_params *s);
double speed_mpz_fac_ui_large(struct speed_params *s);
double speed_mpz_fib_ui(struct speed_params *s);
double speed_mpz_fib2_ui(struct speed_params *s);
double speed_mpz_init_clear(struct speed_params *s);
double speed_mpz_init_realloc_clear(struct speed_params *s);
double speed_mpz_jacobi(struct speed_params *s);
double speed_mpz_lucnum_ui(struct speed_params *s);
double speed_mpz_lucnum2_ui(struct speed_params *s);
double speed_mpz_mod(struct speed_params *s);
double speed_mpz_powm(struct speed_params *s);
double speed_mpz_powm_mod(struct speed_params *s);
double speed_mpz_powm_redc(struct speed_params *s);
double speed_mpz_powm_ui(struct speed_params *s);
double speed_mpz_urandomb(struct speed_params *s);
double speed_gmp_randseed(struct speed_params *s);
double speed_gmp_randseed_ui(struct speed_params *s);
double speed_noop(struct speed_params *s);
double speed_noop_wxs(struct speed_params *s);
double speed_noop_wxys(struct speed_params *s);
double speed_operator_div(struct speed_params *s);
double speed_operator_mod(struct speed_params *s);
double speed_udiv_qrnnd(struct speed_params *s);
double speed_udiv_qrnnd_preinv1(struct speed_params *s);
double speed_udiv_qrnnd_preinv2(struct speed_params *s);
double speed_udiv_qrnnd_c(struct speed_params *s);
double speed_umul_ppmm(struct speed_params *s);
/* Prototypes for other routines */
/* Read the CPU cycle counter: low 32-bits in p[0], high 32-bits in p[1].
   The prototype is given first; on i386 gcc a macro of the same name then
   replaces it, and on MSVC an inline definition follows.  */
void speed_cyclecounter(unsigned p[2]);
void mftb_function(unsigned p[2]);
/* In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy
   output or a clobber for the cpuid, hence an explicit save and restore.  A
   clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use
   the dummy output style in non-PIC, so there's an error if somehow -fPIC
   is used without a -DPIC to tell us about it.  */
#if defined(__GNUC__) && (defined (__i386__) || defined (__i486__))
#ifdef PIC
#define speed_cyclecounter(p)                                           \
  do {                                                                  \
    int  __speed_cyclecounter__save_ebx;                                \
    int  __speed_cyclecounter__dummy;                                   \
    __asm__ __volatile__ ("movl %%ebx, %1\n"                            \
                          "cpuid\n"                                     \
                          "movl %1, %%ebx\n"                            \
                          "rdtsc"                                       \
                          : "=a"   ((p)[0]),                            \
                            "=&rm" (__speed_cyclecounter__save_ebx),    \
                            "=c"   (__speed_cyclecounter__dummy),       \
                            "=d"   ((p)[1]));                           \
  } while (0)
#else
#define speed_cyclecounter(p)                                           \
  do {                                                                  \
    int  __speed_cyclecounter__dummy1;                                  \
    int  __speed_cyclecounter__dummy2;                                  \
    __asm__ __volatile__ ("cpuid\n"                                     \
                          "rdtsc"                                       \
                          : "=a" ((p)[0]),                              \
                            "=b" (__speed_cyclecounter__dummy1),        \
                            "=c" (__speed_cyclecounter__dummy2),        \
                            "=d" ((p)[1]));                             \
  } while (0)
#endif
#elif defined( _MSC_VER )
#include <intrin.h>
#pragma intrinsic(__rdtsc)
/* Store the two halves of the 64-bit counter separately.  Writing through
   an (unsigned long long *) cast of p would break the aliasing rules and
   assume 8-byte alignment of an unsigned array.  */
__inline void speed_cyclecounter(unsigned p[2])
{
  unsigned long long tsc = __rdtsc();
  p[0] = (unsigned) (tsc & 0xFFFFFFFF);
  p[1] = (unsigned) (tsc >> 32);
}
#endif
/* Difference between two speed_cyclecounter samples, as a double.
FREQ_MEASURE_ONE uses it for the cycle count of a timed interval. */
double speed_cyclecounter_diff(const unsigned end[2], const unsigned start[2]);
/* Timing-method helpers.  NOTE(review): the *_p functions look like
predicates on what the system timers provide -- confirm against their
definitions.  clk_tck() is used by FREQ_MEASURE_ONE as ticks per second. */
int gettimeofday_microseconds_p(void);
int getrusage_microseconds_p(void);
int cycles_works_p(void);
long clk_tck(void);
double freq_measure(const char *, double (*)(void));
int double_cmp_ptr(const double *p, const double *q);
void pentium_wbinvd(void);
/* Comparison callback with the signature qsort takes. */
typedef int (*qsort_function_t)(const void *, const void *);
/* NOTE(review): presumably do-nothing call targets, timed by the
speed_noop* routines -- confirm. */
void noop(void);
void noop_1(mp_limb_t n);
void noop_wxs(mp_ptr wp, mp_srcptr xp, mp_size_t size);
void noop_wxys(mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size);
void mpn_cache_fill(mp_srcptr ptr, mp_size_t size);
void mpn_cache_fill_dummy(mp_limb_t n);
/* The SPEED_ROUTINE_* macros call speed_operand_src and speed_operand_dst
for each operand, then speed_cache_fill, before starting the timer. */
void speed_cache_fill(struct speed_params *s);
void speed_operand_src(struct speed_params *s, mp_ptr ptr, mp_size_t size);
void speed_operand_dst(struct speed_params *s, mp_ptr ptr, mp_size_t size);
extern int speed_option_addrs;
/* FREQ_MEASURE_ONE prints its measurement when this is >= 2. */
extern int speed_option_verbose;
void speed_option_set(const char *s);
/* Routines defined outside this header.  NOTE(review): the _div/_inv,
_1.._4, _open/_mpn and similar suffixes appear to name alternative
implementations of one operation, kept side by side for comparison --
confirm. */
mp_limb_t mpn_divrem_1_div(mp_ptr qp, mp_size_t xsize, mp_srcptr ap, mp_size_t size, mp_limb_t d);
mp_limb_t mpn_divrem_1_inv(mp_ptr qp, mp_size_t xsize, mp_srcptr ap, mp_size_t size, mp_limb_t d);
mp_limb_t mpn_divrem_2_div(mp_ptr qp, mp_size_t qxn, mp_ptr np, mp_size_t nsize, mp_srcptr dp);
mp_limb_t mpn_divrem_2_inv(mp_ptr qp, mp_size_t qxn, mp_ptr np, mp_size_t nsize, mp_srcptr dp);
int mpn_jacobi_base_1(mp_limb_t a, mp_limb_t b, int result_bit1);
int mpn_jacobi_base_2(mp_limb_t a, mp_limb_t b, int result_bit1);
int mpn_jacobi_base_3(mp_limb_t a, mp_limb_t b, int result_bit1);
int mpn_jacobi_base_4(mp_limb_t a, mp_limb_t b, int result_bit1);
mp_limb_t mpn_mod_1_div(mp_srcptr ap, mp_size_t size, mp_limb_t d);
mp_limb_t mpn_mod_1_inv(mp_srcptr ap, mp_size_t size, mp_limb_t d);
mp_size_t mpn_gcdext_one_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
mp_size_t mpn_gcdext_one_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
mp_size_t mpn_gcdext_single (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
mp_size_t mpn_gcdext_double (mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t);
/* struct hgcd_matrix is not declared in this header; it must be visible
before this point. */
mp_size_t mpn_hgcd_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
mp_size_t mpn_hgcd_lehmer_itch (mp_size_t);
mp_size_t mpn_hgcd_appr_lehmer (mp_ptr, mp_ptr, mp_size_t, struct hgcd_matrix *, mp_ptr);
mp_size_t mpn_hgcd_appr_lehmer_itch (mp_size_t);
mp_size_t mpn_hgcd_reduce_1 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
mp_size_t mpn_hgcd_reduce_1_itch (mp_size_t, mp_size_t);
mp_size_t mpn_hgcd_reduce_2 (struct hgcd_matrix *, mp_ptr, mp_ptr, mp_size_t, mp_size_t, mp_ptr);
mp_size_t mpn_hgcd_reduce_2_itch (mp_size_t, mp_size_t);
mp_size_t mpn_set_str_basecase(mp_ptr, const unsigned char *, size_t, int);
mp_size_t mpn_set_str_subquad(mp_ptr, const unsigned char *, size_t, int);
void mpn_toom3_mul_n_open(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_ptr);
void mpn_toom4_mul_n_open(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
void mpn_toom8h_mul_open(mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t);
void mpn_toom3_sqr_n_open(mp_ptr, mp_srcptr, mp_size_t, mp_ptr);
void mpn_toom4_sqr_n_open(mp_ptr, mp_srcptr, mp_size_t, mp_ptr);
void mpn_toom8_sqr_n_open(mp_ptr, mp_srcptr, mp_size_t, mp_ptr);
void mpn_toom3_mul_n_mpn(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t, mp_ptr);
void mpn_toom4_mul_n_mpn(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
void mpn_toom8h_mul_mpn(mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t);
void mpn_toom3_sqr_n_mpn(mp_ptr, mp_srcptr, mp_size_t, mp_ptr);
void mpn_toom4_sqr_n_mpn(mp_ptr, mp_srcptr, mp_size_t, mp_ptr);
void mpn_toom8_sqr_n_mpn(mp_ptr, mp_srcptr, mp_size_t, mp_ptr);
void mpz_powm_mod(mpz_ptr res, mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod);
void mpz_powm_redc(mpz_ptr res, mpz_srcptr base, mpz_srcptr e, mpz_srcptr mod);
void redc(mp_ptr cp, mp_ptr tp, mp_srcptr mp, mp_size_t n, mp_limb_t Nprim);
void mpz_fac_ui_small(mpz_ptr,unsigned long);
void mpz_fac_ui_large(mpz_ptr,unsigned long);
int speed_routine_count_zeros_setup(struct speed_params *s, mp_ptr xp, int leading, int zero);
/* "get" is called repeatedly until it ticks over, just in case on a fast
processor it takes less than a microsecond, though this is probably
unlikely if it's a system call.
speed_cyclecounter is called on the same side of the "get" for the start
and end measurements. It doesn't matter how long it takes from the "get"
sample to the cycles sample, since that period will cancel out in the
difference calculation (assuming it's the same each time).
Letting the test run for more than a process time slice is probably only
going to reduce accuracy, especially for getrusage when the cycle counter
is real time, or for gettimeofday if the cycle counter is in fact process
time. Use CLK_TCK/2 as a reasonable stop.
It'd be desirable to be quite accurate here. The default speed_precision
for a cycle counter is 10000 cycles, so to mix that with getrusage or
gettimeofday the frequency should be at least that accurate. But running
measurements for 10000 microseconds (or more) is too long. Be satisfied
with just a half clock tick (5000 microseconds usually). */
#define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec) \
do { \
type st1, st, et1, et; \
unsigned sc[2], ec[2]; \
long dt, half_tick; \
double dc, cyc; \
\
half_tick = (1000000L / clk_tck()) / 2; \
\
get (st1); \
do { \
get (st); \
} while (usec(st) == usec(st1) && sec(st) == sec(st1)); \
\
getc (sc); \
\
for (;;) \
{ \
get (et1); \
do { \
get (et); \
} while (usec(et) == usec(et1) && sec(et) == sec(et1)); \
\
getc (ec); \
\
dc = speed_cyclecounter_diff (ec, sc); \
\
/* allow secs to cancel before multiplying */ \
dt = sec(et) - sec(st); \
dt = dt * 1000000L + (usec(et) - usec(st)); \
\
if (dt >= half_tick) \
break; \
} \
\
cyc = dt * 1e-6 / dc; \
\
if (speed_option_verbose >= 2) \
printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n", \
name, dc, dt, cyc); \
\
return dt * 1e-6 / dc; \
\
} while (0)
/* The measuring routines use these big macros to save duplication for
similar forms. They also get used for some automatically generated
measuring of new implementations of functions.
Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
function pointer is considered undesirable since it's not the way a
normal application will be calling, and some processors might do
different things with an indirect call, like not branch predicting, or
doing a full pipe flush. At least some of the "functions" measured are
actually macros too.
The net effect is to bloat the object code, possibly in a big way, but
only what's being measured is being run, so that doesn't matter.
The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
ATTRIBUTE_CONST on the called functions. Adding a cast to a non-pure
function pointer doesn't work in gcc 3.2. Using an actual non-pure
function pointer variable works, but stands a real risk of a
non-optimizing compiler generating unnecessary overheads in the call.
Currently the best idea is not to use those attributes for a timing
program build. __GMP_NO_ATTRIBUTE_CONST_PURE will tell mpir.h and
gmp-impl.h to omit them from routines there. */
/* Reject unsupported parameters: make the enclosing measuring routine
return -1.0 when cond is false.  The expansion is a bare "if" statement
that already ends in ";", so use it only as a complete statement of its
own, never as the unbraced body of an if/else. */
#define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0;
/* For mpn_copy or similar. */
#define SPEED_ROUTINE_MPN_COPY(function) \
{ \
mp_ptr wp; \
unsigned i; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 0); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
speed_operand_src (s, s->xp, s->size); \
speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
i = s->reps; \
do \
function (wp, s->xp, s->size); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
/* As SPEED_ROUTINE_MPN_COPY, but for a function taking one more trailing
argument, which is always passed as 0: the timed call is
function (wp, s->xp, s->size, 0).  NOTE(review): the extra argument is
presumably a carry-in -- confirm.  Returns -1.0 if s->size is negative,
otherwise the speed_endtime() value for s->reps calls. */
#define SPEED_ROUTINE_MPN_COPYC(function) \
{ \
mp_ptr wp; \
unsigned i; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 0); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
speed_operand_src (s, s->xp, s->size); \
speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
i = s->reps; \
do \
function (wp, s->xp, s->size, 0); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
/* s->size is still in limbs, and it's limbs which are copied, but
"function" takes a size in bytes not limbs. */
/* The timed call is function (wp, s->xp, s->size * BYTES_PER_MP_LIMB); the
operands are still registered with speed_operand_src and speed_operand_dst
in limbs.  Returns -1.0 if s->size is negative, otherwise the
speed_endtime() value for s->reps calls. */
#define SPEED_ROUTINE_MPN_COPY_BYTES(function) \
{ \
mp_ptr wp; \
unsigned i; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 0); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
speed_operand_src (s, s->xp, s->size); \
speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
i = s->reps; \
do \
function (wp, s->xp, s->size * BYTES_PER_MP_LIMB); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
/* For mpn_add_n, mpn_sub_n, or similar. */
#define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \
{ \
mp_ptr wp; \
mp_ptr xp, yp; \
unsigned i; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 1); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
xp = s->xp; \
yp = s->yp; \
\
if (s->r == 0) ; \
else if (s->r == 1) { xp = wp; } \
else if (s->r == 2) { yp = wp; } \
else if (s->r == 3) { xp = wp; yp = wp; } \
else if (s->r == 4) { yp = xp; } \
else { \
TMP_FREE; \
return -1.0; \
} \
\
/* initialize wp if operand overlap */ \
if (xp == wp || yp == wp) \
MPN_COPY (wp, s->xp, s->size); \
\
speed_operand_src (s, xp, s->size); \
speed_operand_src (s, yp, s->size); \
speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
i = s->reps; \
do \
call; \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
/* For routines with two destinations and two sources (presumably
mpn_sumdiff_n, mpn_nsumdiff_n, or similar).
"call" is written in terms of the locals ap and sp (destinations), xp and
yp (sources) and s->size.
The bits of s->r select operand overlap:
1  xp = ap
2  yp = ap
4  xp = sp
8  yp = sp
Bits 1 and 2 together, or bits 4 and 8 together, are rejected with -1.0, as
is s->size < 1.  If bits 1 and 4 (or 2 and 8) are both set the later
assignment wins, giving xp = sp (or yp = sp).
NOTE(review): sp is allocated with s->align_wp, not s->align_wp2 as in
SPEED_ROUTINE_MPN_TRINARY_N -- confirm this is intended. */
#define SPEED_ROUTINE_MPN_SUMDIFF_N_CALL(call) \
{ \
mp_ptr ap, sp; \
mp_ptr xp, yp; \
unsigned i; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 1); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \
SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp); \
\
xp = s->xp; \
yp = s->yp; \
\
if ((s->r & 1) != 0) { xp = ap; } \
if ((s->r & 2) != 0) { yp = ap; } \
if ((s->r & 4) != 0) { xp = sp; } \
if ((s->r & 8) != 0) { yp = sp; } \
if ((s->r & 3) == 3 || (s->r & 12) == 12) \
{ \
TMP_FREE; \
return -1.0; \
} \
\
/* initialize ap if operand overlap */ \
if (xp == ap || yp == ap) \
MPN_COPY (ap, s->xp, s->size); \
/* initialize sp if operand overlap */ \
if (xp == sp || yp == sp) \
MPN_COPY (sp, s->xp, s->size); \
\
speed_operand_src (s, xp, s->size); \
speed_operand_src (s, yp, s->size); \
speed_operand_dst (s, ap, s->size); \
speed_operand_dst (s, sp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
i = s->reps; \
do \
call; \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
/* For addadd, addsub or subadd style routines.  "call" is timed as
   call (ap, sp, xp, yp, s->size), where ap is an s->size limb destination
   allocated here, sp is an s->size limb source allocated here, and xp, yp
   are s->xp, s->yp.

   sp is registered as a source operand, so it is filled from s->xp before
   timing; without that the timed routine would read uninitialized limbs.  */
#define SPEED_ROUTINE_MPN_TRINARY_N(call) \
  { \
    mp_ptr    ap, sp; \
    mp_ptr    xp, yp; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp2); \
    \
    xp = s->xp; \
    yp = s->yp; \
    \
    /* sp is read by the timed call, give it defined contents */ \
    MPN_COPY (sp, s->xp, s->size); \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, s->size); \
    speed_operand_src (s, sp, s->size); \
    speed_operand_dst (s, ap, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call(ap,sp,xp,yp,s->size); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* For add_err1_n style routines.  "call" is timed as
   call (ap, sp, xp, ep, yp, s->size, 0), where ap is an s->size limb
   destination allocated here, sp is an s->size limb source allocated here,
   xp and yp are s->xp and s->yp, and ep is a local two-limb array
   (presumably written by the routine -- confirm against its prototype).

   sp is registered as a source operand, so it is filled from s->xp before
   timing; without that the timed routine would read uninitialized limbs.  */
#define SPEED_ROUTINE_MPN_ADD_ERR1_N(call) \
  { \
    mp_ptr     ap, sp; \
    mp_ptr     xp, yp; \
    mp_limb_t  ep[2]; \
    unsigned   i; \
    double     t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \
    SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp2); \
    \
    xp = s->xp; \
    yp = s->yp; \
    \
    /* sp is read by the timed call, give it defined contents */ \
    MPN_COPY (sp, s->xp, s->size); \
    \
    speed_operand_src (s, xp, s->size); \
    speed_operand_src (s, yp, s->size); \
    speed_operand_src (s, sp, s->size); \
    speed_operand_dst (s, ap, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call(ap,sp,xp,ep,yp,s->size,0); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time "function" called as (*function) (wp, xp, yp, s->size), using
   SPEED_ROUTINE_MPN_BINARY_N_CALL for the setup, so s->r selects the
   operand overlap. */
#define SPEED_ROUTINE_MPN_BINARY_N(function) \
SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
/* As SPEED_ROUTINE_MPN_BINARY_N, but "function" takes one extra trailing
   argument, passed as 0 (presumably a carry-in -- confirm against the
   routine's prototype). */
#define SPEED_ROUTINE_MPN_BINARY_NC(function) \
SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
/* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar.

   "call" is the expression to be timed.  It can refer to wp, an s->size
   limb destination allocated here, and to fields of s such as s->xp,
   s->size and s->r.  wp is not initialized before timing.  s->xp is
   registered as the source operand and wp as the destination; "call" is
   executed s->reps times between speed_starttime and speed_endtime, and
   the speed_endtime result is returned. */
#define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
{ \
mp_ptr wp; \
unsigned i; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 1); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
speed_operand_src (s, s->xp, s->size); \
speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
i = s->reps; \
do \
call; \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
/* For routines taking just (dst, src, size), with no extra parameter.
   "call" is timed as call (wp, s->xp, s->size), where wp is an s->size
   limb destination allocated here.  s->r is not used.  The call is
   executed s->reps times between speed_starttime and speed_endtime, and
   the speed_endtime result is returned. */
#define SPEED_ROUTINE_MPN_SHIFTX(call) \
{ \
mp_ptr wp; \
unsigned i; \
double t; \
TMP_DECL; \
\
SPEED_RESTRICT_COND (s->size >= 1); \
\
TMP_MARK; \
SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
\
speed_operand_src (s, s->xp, s->size); \
speed_operand_dst (s, wp, s->size); \
speed_cache_fill (s); \
\
speed_starttime (); \
i = s->reps; \
do \
call(wp,s->xp,s->size); \
while (--i != 0); \
t = speed_endtime (); \
\
TMP_FREE; \
return t; \
}
/* Time "function" called as (*function) (wp, s->xp, s->size, s->r), with
   s->r supplying the routine's single-limb parameter. */
#define SPEED_ROUTINE_MPN_UNARY_1(function) \
SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
/* As SPEED_ROUTINE_MPN_UNARY_1, but "function" takes one extra trailing
   argument, passed as 0 (presumably a carry-in -- confirm against the
   routine's prototype). */
#define SPEED_ROUTINE_MPN_UNARY_1C(function) \
SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
/* In-place variant of SPEED_ROUTINE_MPN_UNARY_1: "function" is timed as
   (*function) (wp, wp, s->size, s->r), i.e. with source and destination
   the same s->size limb block allocated here.

   wp is filled from s->xp before timing so the routine never reads
   uninitialized limbs.  Operand registration (s->xp as source, wp as
   destination) is the same as in SPEED_ROUTINE_MPN_UNARY_1_CALL.  The call
   is executed s->reps times and the speed_endtime result is returned.  */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \
  { \
    mp_ptr    wp; \
    unsigned  i; \
    double    t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    /* wp is both source and destination, start it off from xp */ \
    MPN_COPY (wp, s->xp, s->size); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      (*function) (wp, wp, s->size, s->r); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time "function" called as (*function) (wp, s->xp, s->size, s->r), with
   s->r as the divisor.  Nothing here makes s->xp an exact multiple of
   s->r. */
#define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
/* Time "function" called as (*function) (wp, s->xp, s->size, s->r, inv),
   where wp is an s->size limb destination allocated here and
   inv = MP_LIMB_T_MAX / s->r.

   s->r == 0 is rejected before inv is computed, since the quotient would
   otherwise be a division by zero.  The call is executed s->reps times and
   the speed_endtime result is returned.  */
#define SPEED_ROUTINE_MPN_DIVEXACT_BYFOBM1(function) \
  { \
    mp_ptr     wp; \
    unsigned   i; \
    double     t; \
    mp_limb_t  inv; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    inv = MP_LIMB_T_MAX / s->r; \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      (*function) (wp, s->xp, s->size, s->r,inv); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
/* Time "function" called as (*function) (wp, 0, s->xp, s->size, s->r):
   s->size limbs at s->xp divided by s->r, with 0 for the second argument
   (the fraction limb count, going by the F variants of this macro). */
#define SPEED_ROUTINE_MPN_DIVREM_1(function) \
SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
/* As SPEED_ROUTINE_MPN_DIVREM_1, but "function" takes one extra trailing
   argument, passed as 0 (presumably a carry-in -- confirm against the
   routine's prototype). */
#define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
/* Fraction-only form: "function" is timed as
   (*function) (wp, s->size, s->xp, 0, s->r), i.e. s->size is passed as the
   second argument and the source size is 0. */
#define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
/* As SPEED_ROUTINE_MPN_DIVREM_1F, but "function" takes one extra trailing
   argument, passed as 0 (presumably a carry-in -- confirm against the
   routine's prototype). */
#define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
/* Timing body for preinverted divrem_1 style routines.  Before handing
   "call" to SPEED_ROUTINE_MPN_UNARY_1_CALL this sets up two extra locals
   that "call" can use: shift, from count_leading_zeros on s->r, and dinv,
   from invert_limb on s->r << shift.  s->r == 0 is rejected.

   The closing brace must not be followed by a line continuation, otherwise
   the next line of the file becomes part of this macro.  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call) \
  { \
    unsigned   shift; \
    mp_limb_t  dinv; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    SPEED_RESTRICT_COND (s->r != 0); \
    \
    count_leading_zeros (shift, s->r); \
    invert_limb (dinv, s->r << shift); \
    \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
  }
/* Time "function" called as
   (*function) (wp, 0, s->xp, s->size, s->r, dinv, shift), with dinv and
   shift supplied by SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL. */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function) \
SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
/* s->size limbs worth of fraction part: "function" is timed as
   (*function) (wp, s->size, s->xp, 0, s->r, dinv, shift), i.e. with a
   source size of 0, and with dinv and shift supplied by
   SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL. */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function) \
SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
/* Timing body for routines taking an N limb second operand, called as
   function (wp, s->xp, s->size, yp).

   yp is a local N limb array with every limb set to s->r, or to
   MP_BASES_BIG_BASE_10 when s->r is zero.  Repeating s->r like this may
   not be especially meaningful, but it does give some control over the
   multiplier.  The destination wp has s->size + N-1 limbs.  */
#define SPEED_ROUTINE_MPN_UNARY_N(function,N) \
  { \
    mp_ptr     wp; \
    mp_size_t  wn; \
    unsigned   i; \
    double     t; \
    mp_limb_t  yp[N]; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= N); \
    \
    TMP_MARK; \
    wn = s->size + N-1; \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
    \
    /* replicate the chosen limb across the whole of yp */ \
    for (i = 0; i < N; i++) \
      { \
        if (s->r != 0) \
          yp[i] = s->r; \
        else \
          yp[i] = MP_BASES_BIG_BASE_10; \
      } \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_src (s, yp, (mp_size_t) N); \
    speed_operand_dst (s, wp, wn); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      function (wp, s->xp, s->size, yp); \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }