forked from gabr42/FastMM4-MP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
FastMM4LockFreeStack.pas
352 lines (319 loc) · 10.1 KB
/
FastMM4LockFreeStack.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
// Based on TOmniBaseBoundedStack class from the OmniThreadLibrary,
// originally written by GJ and Primoz Gabrijelcic.
unit FastMM4LockFreeStack;
interface
type
PReferencedPtr = ^TReferencedPtr;
TReferencedPtr = record
PData : pointer;
Reference: NativeInt;
end;
PLinkedData = ^TLinkedData;
TLinkedData = packed record
Next: PLinkedData;
Data: record end; //user data, variable size
end;
TLFStack = record
strict private
FDataBuffer : pointer;
FElementSize : integer;
FNumElements : integer;
FPublicChainP : PReferencedPtr;
FRecycleChainP: PReferencedPtr;
class var
class var obsIsInitialized: boolean; //default is false
class var obsTaskPopLoops : NativeInt;
class var obsTaskPushLoops: NativeInt;
class function PopLink(var chain: TReferencedPtr): PLinkedData; static;
class procedure PushLink(const link: PLinkedData; var chain: TReferencedPtr); static;
procedure MeasureExecutionTimes;
public
procedure Empty;
procedure Initialize(numElements, elementSize: integer);
procedure Finalize;
function IsEmpty: boolean; inline;
function IsFull: boolean; inline;
function Pop(var value): boolean;
function Push(const value): boolean;
property ElementSize: integer read FElementSize;
property NumElements: integer read FNumElements;
end;
implementation
uses
Windows;
{$IF CompilerVersion < 23}
{$IFNDEF CPUX64}
type
NativeInt = integer;
NativeUInt = cardinal;
{$ENDIF}
{$IFEND}
var
CASAlignment: integer; //required alignment for the CAS function - 8 or 16, depending on the platform
function RoundUpTo(value: pointer; granularity: integer): pointer;
begin
Result := pointer((((NativeInt(value) - 1) div granularity) + 1) * granularity);
end;
function GetCPUTimeStamp: int64;
asm
rdtsc
{$IFDEF CPUX64}
shl rdx, 32
or rax, rdx
{$ENDIF CPUX64}
end;
function GetThreadId: NativeInt;
//result := GetCurrentThreadId;
asm
{$IFNDEF CPUX64}
mov eax, fs:[$18] //eax := thread information block
mov eax, [eax + $24] //eax := thread id
{$ELSE CPUX64}
mov rax, gs:[abs $30]
mov eax, [rax + $48]
{$ENDIF CPUX64}
end;
function CAS(const oldValue, newValue: NativeInt; var destination): boolean; overload;
asm
{$IFDEF CPUX64}
mov rax, oldValue
{$ENDIF CPUX64}
lock cmpxchg [destination], newValue
setz al
end;
function CAS(const oldValue, newValue: pointer; var destination): boolean; overload;
asm
{$IFDEF CPUX64}
mov rax, oldValue
{$ENDIF CPUX64}
lock cmpxchg [destination], newValue
setz al
end;
function CAS(const oldData: pointer; oldReference: NativeInt; newData: pointer;
newReference: NativeInt; var destination): boolean; overload;
asm
{$IFNDEF CPUX64}
push edi
push ebx
mov ebx, newData
mov ecx, newReference
mov edi, destination
lock cmpxchg8b qword ptr [edi]
pop ebx
pop edi
{$ELSE CPUX64}
.noframe
push rbx //rsp := rsp - 8 !
mov rax, oldData
mov rbx, newData
mov rcx, newReference
mov r8, [destination + 8] //+8 with respect to .noframe
lock cmpxchg16b [r8]
pop rbx
{$ENDIF CPUX64}
setz al
end;
{ TLFStack }
procedure TLFStack.Empty;
var
linkedData: PLinkedData;
begin
repeat
linkedData := PopLink(FPublicChainP^);
if not assigned(linkedData) then
break; //repeat
PushLink(linkedData, FRecycleChainP^);
until false;
end;
procedure TLFStack.Finalize;
begin
HeapFree(GetProcessHeap, 0, FDataBuffer);
end;
procedure TLFStack.Initialize(numElements, elementSize: integer);
var
bufferElementSize : integer;
currElement : PLinkedData;
dataBuffer : pointer;
iElement : integer;
nextElement : PLinkedData;
roundedElementSize: integer;
begin
Assert(SizeOf(NativeInt) = SizeOf(pointer));
Assert(numElements > 0);
Assert(elementSize > 0);
FNumElements := numElements;
FElementSize := elementSize;
//calculate element size, round up to next aligned value
roundedElementSize := (elementSize + SizeOf(pointer) - 1) AND NOT (SizeOf(pointer) - 1);
//calculate buffer element size, round up to next aligned value
bufferElementSize := ((SizeOf(TLinkedData) + roundedElementSize) + SizeOf(pointer) - 1) AND NOT (SizeOf(pointer) - 1);
//calculate DataBuffer
FDataBuffer := HeapAlloc(GetProcessHeap, HEAP_GENERATE_EXCEPTIONS, bufferElementSize * numElements + 2 * SizeOf(TReferencedPtr) + CASAlignment);
dataBuffer := RoundUpTo(FDataBuffer, CASAlignment);
if NativeInt(dataBuffer) AND (SizeOf(pointer) - 1) <> 0 then
// TODO 1 raise exception - how?
Halt; //raise Exception.Create('TOmniBaseContainer: obcBuffer is not aligned');
FPublicChainP := dataBuffer;
inc(NativeInt(dataBuffer), SizeOf(TReferencedPtr));
FRecycleChainP := dataBuffer;
inc(NativeInt(dataBuffer), SizeOf(TReferencedPtr));
//Format buffer to recycleChain, init obsRecycleChain and obsPublicChain.
//At the beginning, all elements are linked into the recycle chain.
FRecycleChainP^.PData := dataBuffer;
currElement := FRecycleChainP^.PData;
for iElement := 0 to FNumElements - 2 do begin
nextElement := PLinkedData(NativeInt(currElement) + bufferElementSize);
currElement.Next := nextElement;
currElement := nextElement;
end;
currElement.Next := nil; // terminate the chain
FPublicChainP^.PData := nil;
MeasureExecutionTimes;
end;
function TLFStack.IsEmpty: boolean;
begin
Result := not assigned(FPublicChainP^.PData);
end;
function TLFStack.IsFull: boolean;
begin
Result := not assigned(FRecycleChainP^.PData);
end;
procedure TLFStack.MeasureExecutionTimes;
const
NumOfSamples = 10;
var
TimeTestField: array [0..1] of array [1..NumOfSamples] of int64;
function GetMinAndClear(routine, count: cardinal): int64;
var
m: cardinal;
n: integer;
x: integer;
begin
Result := 0;
for m := 1 to count do begin
x:= 1;
for n:= 2 to NumOfSamples do
if TimeTestField[routine, n] < TimeTestField[routine, x] then
x := n;
Inc(Result, TimeTestField[routine, x]);
TimeTestField[routine, x] := MaxLongInt;
end;
end;
var
oldAffinity: NativeUInt;
currElement: PLinkedData;
n : integer;
begin
if not obsIsInitialized then begin
oldAffinity := SetThreadAffinityMask(GetCurrentThread, 1);
try
//Calculate TaskPopDelay and TaskPushDelay counter values depend on CPU speed!!!}
obsTaskPopLoops := 1;
obsTaskPushLoops := 1;
for n := 1 to NumOfSamples do begin
SwitchToThread;
//Measure RemoveLink rutine delay
TimeTestField[0, n] := GetCPUTimeStamp;
currElement := PopLink(FRecycleChainP^);
TimeTestField[0, n] := GetCPUTimeStamp - TimeTestField[0, n];
//Measure InsertLink rutine delay
TimeTestField[1, n] := GetCPUTimeStamp;
PushLink(currElement, FRecycleChainP^);
TimeTestField[1, n] := GetCPUTimeStamp - TimeTestField[1, n];
end;
//Calculate first 4 minimum average for RemoveLink rutine
obsTaskPopLoops := GetMinAndClear(0, 4) div 4;
//Calculate first 4 minimum average for InsertLink rutine
obsTaskPushLoops := GetMinAndClear(1, 4) div 4;
//This gives better performance (determined experimentally)
obsTaskPopLoops := obsTaskPopLoops * 2;
obsTaskPushLoops := obsTaskPushLoops * 2;
obsIsInitialized := true;
finally SetThreadAffinityMask(GetCurrentThread, oldAffinity); end;
end;
end;
function TLFStack.Pop(var value): boolean;
var
linkedData: PLinkedData;
begin
linkedData := PopLink(FPublicChainP^);
Result := assigned(linkedData);
if not Result then
Exit;
Move(linkedData.Data, value, ElementSize);
PushLink(linkedData, FRecycleChainP^);
end;
class function TLFStack.PopLink(var chain: TReferencedPtr): PLinkedData;
//nil << Link.Next << Link.Next << ... << Link.Next
// ^------ < chainHead
var
AtStartReference: NativeInt;
CurrentReference: NativeInt;
TaskCounter : NativeInt;
ThreadReference : NativeInt;
label
TryAgain;
begin
ThreadReference := GetThreadId + 1; //Reference.bit0 := 1
with chain do begin
TryAgain:
TaskCounter := obsTaskPopLoops;
AtStartReference := Reference OR 1; //Reference.bit0 := 1
repeat
CurrentReference := Reference;
Dec(TaskCounter);
until (TaskCounter = 0) or (CurrentReference AND 1 = 0);
if (CurrentReference AND 1 <> 0) and (AtStartReference <> CurrentReference) or
not CAS(CurrentReference, ThreadReference, Reference)
then
goto TryAgain;
//Reference is set...
Result := PData;
//Empty test
if result = nil then
CAS(ThreadReference, 0, Reference) //Clear Reference if task own reference
else if not CAS(Result, ThreadReference, Result.Next, 0, chain) then
goto TryAgain;
end; //with chain
end;
function TLFStack.Push(const value): boolean;
var
linkedData: PLinkedData;
begin
linkedData := PopLink(FRecycleChainP^);
Result := assigned(linkedData);
if not Result then
Exit;
Move(value, linkedData.Data, ElementSize);
PushLink(linkedData, FPublicChainP^);
end;
class procedure TLFStack.PushLink(const link: PLinkedData; var chain: TReferencedPtr);
var
PMemData : pointer;
TaskCounter: NativeInt;
begin
with chain do begin
for TaskCounter := 0 to obsTaskPushLoops do
if (Reference AND 1 = 0) then
break;
repeat
PMemData := PData;
link.Next := PMemData;
until CAS(PMemData, link, PData);
end;
end;
procedure InitializeTimingInfo;
var
stack: TLFStack;
begin
stack.Initialize(10, 4); // enough for initialization
stack.Finalize;
end;
initialization
{$IFDEF CPUX64}
CASAlignment := 16;
{$ELSE}
CASAlignment := 8;
{$ENDIF CPUX64}
InitializeTimingInfo;
end.