@inproceedings{dmx:hpca:2024,
author={Wang, Shu-Ting and Xu, Hanyang and Mamandipoor, Amin and Mahapatra, Rohan and Ahn, Byung Hoon and Ghodrati, Soroush and Kailas, Krishnan and Alian, Mohammad and Esmaeilzadeh, Hadi},
booktitle={2024 IEEE International Symposium on High-Performance Computer Architecture (HPCA)},
title={Data Motion Acceleration: Chaining Cross-Domain Multi Accelerators},
year={2024},
pages={1043-1062},
}
@inproceedings{accelerator-cluster:hoti:2023,
author = {Bill Dally},
title = {Accelerator Clusters: the New Supercomputer},
year = {2023},
booktitle = {HOTI}
}
@inproceedings{q100:asplos:2014,
author = {Wu, Lisa and Lottarini, Andrea and Paine, Timothy K. and Kim, Martha A. and Ross, Kenneth A.},
title = {Q100: The Architecture and Design of a Database Processing Unit},
year = {2014},
isbn = {9781450323055},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2541940.2541961},
doi = {10.1145/2541940.2541961},
abstract = {In this paper, we propose Database Processing Units, or DPUs, a class of domain-specific database processors that can efficiently handle database applications. As a proof of concept, we present the instruction set architecture, microarchitecture, and hardware implementation of one DPU, called Q100. The Q100 has a collection of heterogeneous ASIC tiles that process relational tables and columns quickly and energy-efficiently. The architecture uses coarse grained instructions that manipulate streams of data, thereby maximizing pipeline and data parallelism, and minimizing the need to time multiplex the accelerator tiles and spill intermediate results to memory. This work explores a Q100 design space of 150 configurations, selecting three for further analysis: a small, power-conscious implementation, a high-performance implementation, and a balanced design that maximizes performance per Watt. We then demonstrate that the power-conscious Q100 handles the TPC-H queries with three orders of magnitude less energy than a state of the art software DBMS, while the performance-oriented design outperforms the same DBMS by 70X.},
booktitle = {Proceedings of the 19th International Conference on Architectural Support for Programming Languages and Operating Systems},
pages = {255–268},
numpages = {14},
keywords = {dpu, accelerator, microarchitecture, streaming data, database, specialized functional unit},
location = {Salt Lake City, Utah, USA},
series = {ASPLOS '14}
}
@inproceedings{meet-the-walkers:isca:2013,
author = {Kocberber, Onur and Grot, Boris and Picorel, Javier and Falsafi, Babak and Lim, Kevin and Ranganathan, Parthasarathy},
title = {Meet the Walkers: Accelerating Index Traversals for in-Memory Databases},
year = {2013},
isbn = {9781450326384},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2540708.2540748},
doi = {10.1145/2540708.2540748},
abstract = {The explosive growth in digital data and its growing role in real-time decision support motivate the design of high-performance database management systems (DBMSs). Meanwhile, slowdown in supply voltage scaling has stymied improvements in core performance and ushered an era of power-limited chips. These developments motivate the design of DBMS accelerators that (a) maximize utility by accelerating the dominant operations, and (b) provide flexibility in the choice of DBMS, data layout, and data types. We study data analytics workloads on contemporary in-memory databases and find hash index lookups to be the largest single contributor to the overall execution time. The critical path in hash index lookups consists of ALU-intensive key hashing followed by pointer chasing through a node list. Based on these observations, we introduce Widx, an on-chip accelerator for database hash index lookups, which achieves both high performance and flexibility by (1) decoupling key hashing from the list traversal, and (2) processing multiple keys in parallel on a set of programmable walker units. Widx reduces design cost and complexity through its tight integration with a conventional core, thus eliminating the need for a dedicated TLB and cache. An evaluation of Widx on a set of modern data analytics workloads (TPC-H, TPC-DS) using full-system simulation shows an average speedup of 3.1x over an aggressive OoO core on bulk hash table operations, while reducing the OoO core energy by 83\%.},
booktitle = {Proceedings of the 46th Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {468–479},
numpages = {12},
keywords = {energy efficiency, database indexing, hardware accelerators},
location = {Davis, California},
series = {MICRO-46}
}
@inproceedings{mahapatra:mlarchsys:2022,
title = {Exploring Efficient ML-based Scheduler for Microservices in Heterogeneous Clusters},
author = {Mahapatra, Rohan and Ahn, Byung Hoon and Wang, Shu-Ting and Xu, Hanyang and Esmaeilzadeh, Hadi},
booktitle = {Machine Learning for Computer Architecture and Systems 2022},
year = {2022}
}
@misc{intel-cascade-lake,
title = {Intel Cascade Lake},
url = {https://ark.intel.com/content/www/us/en/ark/products/192447/intel-xeon-gold-6252-processor-35-75m-cache-2-10-ghz.html}
}
@misc{intel-ice-lake,
title = {Intel Ice Lake},
url = {https://ark.intel.com/content/www/us/en/ark/products/212456/intel-xeon-gold-6348-processor-42m-cache-2-60-ghz.html}
}
@misc{intel-sapphire-rapids,
title = {Intel Sapphire Rapids},
url = {https://ark.intel.com/content/www/us/en/ark/products/231750/intel-xeon-platinum-8468h-processor-105m-cache-2-10-ghz.html}
}
@inproceedings{top-down:ispass:2014,
author = {Yasin, Ahmad},
booktitle = {2014 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
title = {A Top-Down method for performance analysis and counters architecture},
year = {2014},
pages = {35-44}
}
@inproceedings{in-network-compute:eurosys:2019,
author = {Tokusashi, Yuta and Dang, Huynh Tu and Pedone, Fernando and Soul\'{e}, Robert and Zilberman, Noa},
title = {The Case For In-Network Computing On Demand},
year = {2019},
optabstract = {Programmable network hardware can run services traditionally deployed on servers, resulting in orders-of-magnitude improvements in performance. Yet, despite these performance improvements, network operators remain skeptical of in-network computing. The conventional wisdom is that the operational costs from increased power consumption outweigh any performance benefits. Unless in-network computing can justify its costs, it will be disregarded as yet another academic exercise.In this paper, we challenge that assumption, by providing a detailed power analysis of several in-network computing use cases. Our experiments show that in-network computing can be extremely power-efficient. In fact, for a single watt, a software system on commodity CPU can be improved by a factor of x100 using an FPGA, and a factor of x1000 utilizing ASIC implementations. However, this efficiency depends on the system load. To address changing workloads, we propose in-network computing on demand, where services can be dynamically moved between servers and the network. By shifting the placement of services on-demand, data centers can optimize for both performance and power efficiency.},
booktitle = {Proceedings of the Fourteenth EuroSys Conference 2019},
optseries = {EuroSys '19}
}
@inproceedings{in-network-compute:hotnets:2017,
author = {Sapio, Amedeo and Abdelaziz, Ibrahim and Aldilaijan, Abdulla and Canini, Marco and Kalnis, Panos},
title = {In-Network Computation is a Dumb Idea Whose Time Has Come},
year = {2017},
abstract = {Programmable data plane hardware creates new opportunities for infusing intelligence into the network. This raises a fundamental question: what kinds of computation should be delegated to the network? In this paper, we discuss the opportunities and challenges for co-designing data center distributed systems with their network layer. We believe that the time has finally come for offloading part of their computation to execute in-network. However, in-network computation tasks must be judiciously crafted to match the limitations of the network machine architecture of programmable devices. With the help of our experiments on machine learning and graph analytics workloads, we identify that aggregation functions raise opportunities to exploit the limited computation power of networking hardware to lessen network congestion and improve the overall application performance. Moreover, as a proof-of-concept, we propose Daiet, a system that performs in-network data aggregation. Experimental results with an initial prototype show a large data reduction ratio (86.9\%-89.3\%) and a similar decrease in the workers' computation time.},
booktitle = {Proceedings of the 16th ACM Workshop on Hot Topics in Networks}
}
@inproceedings{drmt:sigcomm:2017,
author = {Chole, Sharad and Fingerhut, Andy and Ma, Sha and Sivaraman, Anirudh and Vargaftik, Shay and Berger, Alon and Mendelson, Gal and Alizadeh, Mohammad and Chuang, Shang-Tse and Keslassy, Isaac and Orda, Ariel and Edsall, Tom},
title = {{dRMT}: Disaggregated Programmable Switching},
year = {2017},
optabstract = {We present dRMT (disaggregated Reconfigurable Match-Action Table), a new architecture for programmable switches. dRMT overcomes two important restrictions of RMT, the predominant pipeline-based architecture for programmable switches: (1) table memory is local to an RMT pipeline stage, implying that memory not used by one stage cannot be reclaimed by another, and (2) RMT is hardwired to always sequentially execute matches followed by actions as packets traverse pipeline stages. We show that these restrictions make it difficult to execute programs efficiently on RMT.dRMT resolves both issues by disaggregating the memory and compute resources of a programmable switch. Specifically, dRMT moves table memories out of pipeline stages and into a centralized pool that is accessible through a crossbar. In addition, dRMT replaces RMT's pipeline stages with a cluster of processors that can execute match and action operations in any order.We show how to schedule a P4 program on dRMT at compile time to guarantee deterministic throughput and latency. We also present a hardware design for dRMT and analyze its feasibility and chip area. Our results show that dRMT can run programs at line rate with fewer processors compared to RMT, and avoids performance cliffs when there are not enough processors to run a program at line rate. dRMT's hardware design incurs a modest increase in chip area relative to RMT, mainly due to the crossbar.},
booktitle = {Proceedings of the Conference of the ACM Special Interest Group on Data Communication},
optkeywords = {disagreggation, RMT, Programmable switching, packet processing},
optseries = {SIGCOMM '17}
}
@inproceedings{rmt:sigcomm:2013,
author = {Bosshart, Pat and Gibb, Glen and Kim, Hun-Seok and Varghese, George and McKeown, Nick and Izzard, Martin and Mujica, Fernando and Horowitz, Mark},
title = {Forwarding Metamorphosis: Fast Programmable Match-Action Processing in Hardware for SDN},
year = {2013},
optabstract = {In Software Defined Networking (SDN) the control plane is physically separate from the forwarding plane. Control software programs the forwarding plane (e.g., switches and routers) using an open interface, such as OpenFlow. This paper aims to overcomes two limitations in current switching chips and the OpenFlow protocol: i) current hardware switches are quite rigid, allowing ``Match-Action'' processing on only a fixed set of fields, and ii) the OpenFlow specification only defines a limited repertoire of packet processing actions. We propose the RMT (reconfigurable match tables) model, a new RISC-inspired pipelined architecture for switching chips, and we identify the essential minimal set of action primitives to specify how headers are processed in hardware. RMT allows the forwarding plane to be changed in the field without modifying hardware. As in OpenFlow, the programmer can specify multiple match tables of arbitrary width and depth, subject only to an overall resource limit, with each table configurable for matching on arbitrary fields. However, RMT allows the programmer to modify all header fields much more comprehensively than in OpenFlow. Our paper describes the design of a 64 port by 10 Gb/s switch chip implementing the RMT model. Our concrete design demonstrates, contrary to concerns within the community, that flexible OpenFlow hardware switch implementations are feasible at almost no additional cost or power.},
booktitle = {Proceedings of the Conference of the ACM Special Interest Group on Data Communication},
optkeywords = {sdn, reconfigurable match tables, rmt model},
optseries = {SIGCOMM '13}
}
@misc{odsa-bow-spec,
title = {{ODSA}-{BoW} Specifications},
url = {https://opencomputeproject.github.io/ODSA-BoW/bow_specification.html}
}
@misc{ucie-spec,
title = {{UCIe} 1.1 Specifications},
url = {https://www.uciexpress.org/specifications}
}
@inproceedings{amd-chiplet:isca:2021,
author = {Naffziger, Samuel and Beck, Noah and Burd, Thomas and Lepak, Kevin and Loh, Gabriel H. and Subramony, Mahesh and White, Sean},
title = {Pioneering Chiplet Technology and Design for the AMD EPYC™ and Ryzen™ Processor Families},
year = {2021},
optabstract = {For decades, Moore's Law has delivered the ability to integrate an exponentially increasing number of devices in the same silicon area at a roughly constant cost. This has enabled tremendous levels of integration, where the capabilities of computer systems that previously occupied entire rooms can now fit on a single integrated circuit.In recent times, the steady drum beat of Moore's Law has started to slow down. Whereas device density historically doubled every 18--24 months, the rate of recent silicon process advancements has declined. While improvements in device scaling continue, albeit at a reduced pace, the industry is simultaneously observing increases in manufacturing costs.In response, the industry is now seeing a trend toward reversing direction on the traditional march toward more integration. Instead, multiple industry and academic groups are advocating that systems on chips (SoCs) be "disintegrated" into multiple smaller "chiplets." This paper details the technology challenges that motivated AMD to use chiplets, the technical solutions we developed for our products, and how we expanded the use of chiplets from individual processors to multiple product families.},
booktitle = {Proceedings of the 48th Annual International Symposium on Computer Architecture},
optkeywords = {chiplets, moore's law, processors, industry, modular},
optseries = {ISCA '21}
}
@inproceedings{ibm-telum-processor:isca:2022,
author = {Lichtenau, Cedric and Buyuktosunoglu, Alper and Bertran, Ramon and Figuli, Peter and Jacobi, Christian and Papandreou, Nikolaos and Pozidis, Haris and Saporito, Anthony and Sica, Andrew and Tzortzatos, Elpida},
title = {{AI} Accelerator on {IBM Telum} Processor: Industrial Product},
year = {2022},
optabstract = {IBM Telum is the next generation processor chip for IBM Z and LinuxONE systems. The Telum design is focused on enterprise class workloads and it achieves over 40\% per socket performance growth compared to IBM z15. The IBM Telum is the first server-class chip with a dedicated on-chip AI accelerator that enables clients to gain real time insights from their data as it is getting processed.Seamlessly infusing AI in all enterprise workloads is highly desirable to get real business insight on every transaction as well as to improve IT operation, security, and data privacy. While it would undeniably provide significant additional value, its application in practice is often accompanied by hurdles from low throughput if run on-platform to security concerns and inconsistent latency if run off-platform. The IBM Telum chip introduces an on-chip AI accelerator that provides consistent low latency and high throughput (over 200 TFLOPS in 32 chip system) inference capacity usable by all threads. The accelerator is memory coherent and directly connected to the fabric like any other general-purpose core to support low latency inference while meeting the system's transaction rate. A scalable architecture providing transparent access to AI accelerator functions via a non-privileged general-purpose core instruction further reduces software orchestration and library complexity as well as provides extensibility to the AI functions. On a global bank customer credit card fraud detection model, the AI accelerator achieves 22\texttimes{} speed up in latency compared to a general purpose core utilizing vector execution units. For the same model, the AI accelerator achieves 116k inferences every second with a latency of only 1.1 msec. As the system is scaled up from one chip to 32 chips, it performs more than 3.5 Million inferences/sec and the latency still stays very low at only 1.2 msec.This paper briefly introduces the IBM Telum chip and later describes the integrated AI accelerator. IBM Telum's AI accelerator architecture, microarchitecture, integration into the system stack, performance, and power are covered in detail.},
booktitle = {Proceedings of the 49th Annual International Symposium on Computer Architecture},
pages = {1012–1028},
optkeywords = {z16, AI on server-class processor, low-latency in-transaction inference, Telum, enterprise workload AI, on-chip AI accelerator},
optlocation = {New York, New York},
optseries = {ISCA '22}
}
@inproceedings{horowitz:isscc:2014,
author = {Horowitz, Mark},
booktitle = {2014 IEEE International Solid-State Circuits Conference Digest of Technical Papers (ISSCC)},
title = {1.1 Computing's energy problem (and what we can do about it)},
year = {2014},
pages = {10-14}
}
@inproceedings{basejump:dac:2018,
author = {Taylor, Michael Bedford},
booktitle = {2018 55th ACM/ESDA/IEEE Design Automation Conference (DAC)},
title = {INVITED: BaseJump STL: SystemVerilog Needs a Standard Template Library for Hardware Design},
year = {2018}
}
@article{blackparrot:ieee-micro:2020,
author = {Petrisko, Daniel and Gilani, Farzam and Wyse, Mark and Jung, Dai Cheol and Davidson, Scott and Gao, Paul and Zhao, Chun and Azad, Zahra and Canakci, Sadullah and Veluri, Bandhav and Guarino, Tavio and Joshi, Ajay and Oskin, Mark and Taylor, Michael Bedford},
journal = {IEEE Micro},
title = {BlackParrot: An Agile Open-Source RISC-V Multicore for Accelerator SoCs},
year = {2020},
volume = {40},
number = {4},
pages = {93-102}
}
@article{democratizing:cacm:2022,
author = {Chi, Yuze and Qiao, Weikang and Sohrabizadeh, Atefeh and Wang, Jie and Cong, Jason},
title = {Democratizing Domain-Specific Computing},
year = {2022},
volume = {66},
number = {1},
abstract = {Creating a programming environment and compilation flow that empowers programmers to create their own DSAs efficiently and affordably on FPGAs.},
journal = {Commun. ACM},
month = {dec},
pages = {74–85}
}
@inproceedings{profiling:isca:2023,
author = {Gonzalez, Abraham and Kolli, Aasheesh and Khan, Samira and Liu, Sihang and Dadu, Vidushi and Karandikar, Sagar and Chang, Jichuan and Asanovic, Krste and Ranganathan, Parthasarathy},
title = {Profiling Hyperscale Big Data Processing},
year = {2023},
optabstract = {Computing demand continues to grow exponentially, largely driven by "big data" processing on hyperscale data stores. At the same time, the slowdown in Moore's law is leading the industry to embrace custom computing in large-scale systems. Taken together, these trends motivate the need to characterize live production traffic on these large data processing platforms and understand the opportunity of acceleration at scale.This paper addresses this key need. We characterize three important production distributed database and data analytics platforms at Google to identify key hardware acceleration opportunities and perform a comprehensive limits study to understand the trade-offs among various hardware acceleration strategies.We observe that hyperscale data processing platforms spend significant time on distributed storage and other remote work across distributed workers. Therefore, optimizing storage and remote work in addition to compute acceleration is critical for these platforms. We present a detailed breakdown of the compute-intensive functions in these platforms and identify dominant key data operations related to datacenter and systems taxes. We observe that no single accelerator can provide a significant benefit but collectively, a sea of accelerators, can accelerate many of these smaller platform-specific functions. We demonstrate the potential gains of the sea of accelerators proposal in a limits study and analytical model. We perform a comprehensive study to understand the trade-offs between accelerator location (on-chip/off-chip) and invocation model (synchronous/asynchronous). We propose and evaluate a chained accelerator execution model where identified compute-intensive functions are accelerated and pipelined to avoid invocation from the core, achieving a 3x improvement over the baseline system while nearly matching identical performance to an ideal fully asynchronous execution model.},
booktitle = {Proceedings of the 50th Annual International Symposium on Computer Architecture},
optseries = {ISCA '23}
}
@article{pymtl3:ieee-micro:2020,
author = {Jiang, Shunning and Pan, Peitian and Ou, Yanghui and Batten, Christopher},
journal = {IEEE Micro},
title = {PyMTL3: A Python Framework for Open-Source Hardware Modeling, Generation, Simulation, and Verification},
year = {2020},
volume = {40},
number = {4},
pages = {58-66}
}
@inproceedings{arc:dac:2012,
title = {Architecture Support for Accelerator-Rich CMPs},
author = {Cong, Jason and Ghodrat, Mohammad Ali and Gill, Michael and Grigorian, Beayna and Reinman, Glenn},
year = 2012,
booktitle = {Proceedings of the 49th Annual Design Automation Conference},
optabstract = {This work discusses a hardware architectural support for accelerator-rich CMPs (ARC). First, we present a hardware resource management scheme for accelerator sharing. This scheme supports sharing and arbitration of multiple cores for a common set of accelerators, and it uses a hardware-based arbitration mechanism to provide feedback to cores to indicate the wait time before a particular resource becomes available. Second, we propose a light-weight interrupt system to reduce the OS overhead of handling interrupts which occur frequently in an accelerator-rich platform. Third, we propose architectural support that allows us to compose a larger virtual accelerator out of multiple smaller accelerators. We have also implemented a complete simulation tool-chain to verify our ARC architecture. Experimental results show significant performance (on average 51X) and energy improvement (on average 17X) compared to approaches using OS-based accelerator management.},
optseries = {DAC '12}
}
@inproceedings{firesim:isca:2018,
title = {{FireSim}: {FPGA}-accelerated Cycle-exact Scale-out System Simulation in the Public Cloud},
author = {Karandikar, Sagar and Mao, Howard and Kim, Donggyu and Biancolin, David and Amid, Alon and Lee, Dayeol and Pemberton, Nathan and Amaro, Emmanuel and Schmidt, Colin and Chopra, Aditya and Huang, Qijing and Kovacs, Kyle and Nikolic, Borivoje and Katz, Randy and Bachrach, Jonathan and Asanovi\'{c}, Krste},
year = 2018,
booktitle = {Proceedings of the 45th Annual International Symposium on Computer Architecture},
optkeywords = {computer architecture, computer networks, computer simulation, data centers, distributed computing, field programmable gate arrays, performance analysis, scalability}
}
@article{ring-all-reduce:jpdc:2009,
title = {Bandwidth Optimal All-Reduce Algorithms for Clusters of Workstations},
author = {Patarasuk, Pitch and Yuan, Xin},
year = 2009,
month = {feb},
journal = {J. Parallel Distrib. Comput.},
volume = 69,
number = 2,
pages = {117–124},
optabstract = {We consider an efficient realization of the all-reduce operation with large data sizes in cluster environments, under the assumption that the reduce operator is associative and commutative. We derive a tight lower bound of the amount of data that must be communicated in order to complete this operation and propose a ring-based algorithm that only requires tree connectivity to achieve bandwidth optimality. Unlike the widely used butterfly-like all-reduce algorithm that incurs network contention in SMP/multi-core clusters, the proposed algorithm can achieve contention-free communication in almost all contemporary clusters, including SMP/multi-core clusters and Ethernet switched clusters with multiple switches. We demonstrate that the proposed algorithm is more efficient than other algorithms on clusters with different nodal architectures and networking technologies when the data size is sufficiently large.},
optnumpages = 8,
optkeywords = {All-reduce, Cluster of workstations, Tree topology, Collective communication}
}
@misc{oneapi,
title = {{oneAPI} Documentation},
url = {https://spec.oneapi.io/versions/latest/index.html}
}
@misc{opencl,
title = {OpenCL API Documentation},
url = {https://man.opencl.org/}
}
@inproceedings{cxl-model:exhet:2022,
title = {Design and Analysis of CXL Performance Models for Tightly-Coupled Heterogeneous Computing},
author = {Cabrera, Anthony M and Young, Aaron R and Vetter, Jeffrey S},
year = 2022,
booktitle = {Proceedings of the 1st International Workshop on Extreme Heterogeneity Solutions},
keywords = {GPU, CXL, GPU-FPGA collaboration, heterogeneous computing, FPGA},
optlocation = {Seoul, Republic of Korea},
optseries = {ExHET '22}
}
@inproceedings{ftrans:ispled:2020,
title = {FTRANS: Energy-Efficient Acceleration of Transformers Using FPGA},
author = {Li, Bingbing and Pandey, Santosh and Fang, Haowen and Lyv, Yanjun and Li, Ji and Chen, Jieyang and Xie, Mimi and Wan, Lipeng and Liu, Hang and Ding, Caiwen},
year = 2020,
booktitle = {ISLPED}
}
@misc{ner-transformer,
title = {Transformers based Named Entity Recognition models},
url = {https://huggingface.co/Jean-Baptiste/roberta-large-ner-english}
}
@misc{broadcom:pcie-switches,
title = {Broadcom PEX88000 Managed PCI Express 4.0 Switches},
url = {https://www.broadcom.com/products/pcie-switches-bridges/expressfabric}
}
@article{axdimm:ieee-micro:2021,
title = {Near-Memory Processing in Action: Accelerating Personalized Recommendation with AxDIMM},
author = {Ke, Liu and Zhang, Xuan and So, Jinin and Lee, Jong-Geon and Kang, Shin-Haeng and Lee, Sukhan and Han, Songyi and Cho, Yeongon and Kim, Jin Hyun and Kwon, Yongsuk and Kim, Kyungsoo and Jung, Jin and Yun, Ilkwon and Park, Sung Joo and Park, Hyunsun and Song, Joonho and Cho, Jeonghyeon and Sohn, Kyomin and Kim, Nam Sung and Lee, Hsien-Hsin Sean},
year = 2021,
journal = {IEEE Micro}
}
@misc{microsoft-azure:zipline:2019,
title = {Azure Zipline},
url = {https://azure.microsoft.com/en-us/blog/improved-cloud-service-performance-through-asic-acceleration/}
}
@misc{samsung-smartSSD:documentation:2020,
title = {SmartSSD Documentation},
url = {https://www.xilinx.com/content/dam/xilinx/support/documents/boards_and_kits/accelerator-cards/1_3/ug1382-smartssd-csd.pdf}
}
@misc{napi:kernel:2022,
title = {NAPI},
url = {https://www.kernel.org/doc/html/next/networking/napi.html}
}
@misc{xilinx-dma_buf:xrt:2022,
title = {Xilinx XRT DMA-BUF API},
url = {https://xilinx.github.io/XRT/master/html/xrt_native_apis.html#dma-buf-api}
}
@misc{dma_buf:kernel:2022,
title = {dma-buf},
url = {https://docs.kernel.org/driver-api/dma-buf.html}
}
@misc{supermicro-sapphire-rapids,
title = {Intel Built-In Accelerators},
url = {https://www.supermicro.com/en/accelerators/intel/built-in-on-demand}
}
@inproceedings{intel-sapphire-rapids:hotchips:2021,
title = {Sapphire Rapids},
author = {Biswas, Arijit},
year = 2021,
booktitle = {Hot Chips}
}
@misc{intel-vtune-top-down,
title = {Intel VTune Top-down Analysis},
url = {https://indico.cern.ch/event/280897/contributions/1628888/attachments/515367/711139/Top_Down_for_CERN_2nd_workshop_-_Ahmad_Yasin.pdf}
}
@misc{intel-vtune,
title = {Intel VTune Profiler},
url = {https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html}
}
@misc{wiki-spectrogram,
title = {Spectrogram},
url = {https://en.wikipedia.org/wiki/Spectrogram}
}
@misc{wiki-mel-scale,
title = {Mel scale},
url = {https://en.wikipedia.org/wiki/Mel_scale}
}
@article{nitro-for-hpc:ieee-micro:2020,
title = {A Cloud-Optimized Transport Protocol for Elastic and Scalable HPC},
author = {Shalev, Leah and Ayoub, Hani and Bshara, Nafea and Sabbag, Erez},
year = 2020,
journal = {IEEE Micro},
volume = 40,
number = 6
}
@misc{aws-nitro,
title = {{AWS Nitro}},
url = {https://aws.amazon.com/blogs/hpc/bare-metal-performance-with-the-aws-nitro-system/}
}
@inproceedings{accelnet:nsdi:2018,
title = {Azure Accelerated Networking: {SmartNICs} in the Public Cloud},
author = {Daniel Firestone and Andrew Putnam and Sambhrama Mundkur and Derek Chiou and Alireza Dabagh and Mike Andrewartha and Hari Angepat and Vivek Bhanu and Adrian Caulfield and Eric Chung and Harish Kumar Chandrappa and Somesh Chaturmohta and Matt Humphrey and Jack Lavier and Norman Lam and Fengfen Liu and Kalin Ovtcharov and Jitu Padhye and Gautham Popuri and Shachar Raindel and Tejas Sapre and Mark Shaw and Gabriel Silva and Madhan Sivakumar and Nisheeth Srivastava and Anshuman Verma and Qasim Zuhair and Deepak Bansal and Doug Burger and Kushagra Vaid and David A. Maltz and Albert Greenberg},
year = 2018,
booktitle = {NSDI}
}
@inproceedings{coyote:osdi:2020,
title = {Do {OS} abstractions make sense on {FPGAs}?},
author = {Dario Korolija and Timothy Roscoe and Gustavo Alonso},
year = 2020,
booktitle = {OSDI}
}
@inproceedings{optimus-hypervisor:asplos:2020,
title = {A Hypervisor for Shared-Memory FPGA Platforms},
author = {Ma, Jiacheng and Zuo, Gefei and Loughlin, Kevin and Cheng, Xiaohe and Liu, Yanqiang and Eneyew, Abel Mulugeta and Qi, Zhengwei and Kasikci, Baris},
year = 2020,
booktitle = {ASPLOS},
optabstract = {Cloud providers widely deploy FPGAs as application-specific accelerators for customer use. These providers seek to multiplex their FPGAs among customers via virtualization, thereby reducing running costs. Unfortunately, most virtualization support is confined to FPGAs that expose a restrictive, host-centric programming model in which accelerators cannot issue direct memory accesses (DMAs). The host-centric model incurs high runtime overhead for workloads that exhibit pointer chasing. Thus, FPGAs are beginning to support a shared-memory programming model in which accelerators can issue DMAs. However, virtualization support for shared-memory FPGAs is limited. This paper presents Optimus, the first hypervisor that supports scalable shared-memory FPGA virtualization. Optimus offers both spatial multiplexing and temporal multiplexing to provide efficient and flexible sharing of each accelerator on an FPGA. To share the FPGA-CPU interconnect at a high clock frequency, Optimus implements a multiplexer tree. To isolate each guest's address space, Optimus introduces the technique of page table slicing as a hardware-software co-design. To support preemptive temporal multiplexing, Optimus provides an accelerator preemption interface. We show that Optimus supports eight physical accelerators on a single FPGA and improves the aggregate throughput of twelve real-world benchmarks by 1.98x-7x.}
}
@misc{nvidia-dali:2018,
title = {Nvidia DALI},
url = {https://developer.nvidia.com/dali}
}
@misc{gpudirect:2019,
title = {GPUDirect},
url = {https://developer.nvidia.com/gpudirect}
}
@inproceedings{optimusprime:asplos:2020,
title = {Optimus Prime: Accelerating Data Transformation in Servers},
author = {Pourhabibi, Arash and Gupta, Siddharth and Kassir, Hussein and Sutherland, Mark and Tian, Zilu and Drumond, Mario Paulo and Falsafi, Babak and Koch, Christoph},
year = 2020,
booktitle = {ASPLOS}
}
@inproceedings{dcs:micro:2015,
title = {DCS: A Fast and Scalable Device-Centric Server Architecture},
author = {Ahn, Jaehyung and Kwon, Dongup and Kim, Youngsok and Ajdari, Mohammadamin and Lee, Jaewon and Kim, Jangwoo},
year = 2015,
booktitle = {MICRO},
optabstract = {Conventional servers have achieved high performance by employing fast CPUs to run compute-intensive workloads, while making operating systems manage relatively slow I/O devices through memory accesses and interrupts. However, as the emerging workloads are becoming heavily data-intensive and the emerging devices (e.g., NVM storage, high-bandwidth NICs, and GPUs) come to enable low-latency and high-bandwidth device operations, the traditional host-centric server architectures fail to deliver high performance due to their inefficient device handling mechanisms. Furthermore, without resolving the architecture inefficiency, the performance loss will continue to increase as the emerging devices become faster.In this paper, we propose DCS, a novel device-centric server architecture to fully exploit the potential of the emerging devices so that the server performance nicely scales with the performance of the devices. The key idea of DCS is to orchestrate the devices to directly communicate with each other while selectively bypassing the host. The host becomes responsible for only few device-related operations (e.g., filesystem lookup). In this way, DCS achieves high I/O performance by direct inter-device communications and high computation performance by fully utilizing the host-side resources. To implement DCS, we introduce DCS Engine, a custom hardware device to orchestrate devices via standard I/O protocols (i.e., PCIe and NVMe), along with its device driver and user-level library. We show that our FPGA-based DCS prototype significantly improves the performance of emerging server workloads and the architecture will nicely scale with the performance of the devices.}
}
@inproceedings{dcs-ctrl:isca:2018,
title = {DCS-ctrl: A Fast and Flexible Device-Control Mechanism for Device-Centric Server Architecture},
author = {Kwon, Dongup and Ahn, Jaehyung and Chae, Dongju and Ajdari, Mohammadamin and Lee, Jaewon and Bae, Suheon and Kim, Youngsok and Kim, Jangwoo},
year = 2018,
booktitle = {ISCA}
}
@article{tmdmpi:trets:2010,
title = {MPI as a Programming Model for High-Performance Reconfigurable Computers},
author = {Salda\~{n}a, Manuel and Patel, Arun and Madill, Christopher and Nunes, Daniel and Wang, Danyao and Chow, Paul and Wittig, Ralph and Styles, Henry and Putnam, Andrew},
year = 2010,
journal = {ACM Trans. Reconfigurable Technol. Syst.},
volume = 3,
number = 4,
optabstract = {High-Performance Reconfigurable Computers (HPRCs) consist of one or more standard microprocessors tightly-coupled with one or more reconfigurable FPGAs. HPRCs have been shown to provide good speedups and good cost/performance ratios, but not necessarily ease of use, leading to a slow acceptance of this technology. HPRCs introduce new design challenges, such as the lack of portability across platforms, incompatibilities with legacy code, users reluctant to change their code base, a prolonged learning curve, and the need for a system-level Hardware/Software co-design development flow. This article presents the evolution and current work on TMD-MPI, which started as an MPI-based programming model for Multiprocessor Systems-on-Chip implemented in FPGAs, and has now evolved to include multiple X86 processors. TMD-MPI is shown to address current design challenges in HPRC usage, suggesting that the MPI standard has enough syntax and semantics to program these new types of parallel architectures. Also presented is the TMD-MPI Ecosystem, which consists of research projects and tools that are developed around TMD-MPI to further improve HPRC usability. Finally, we present preliminary communication performance measurements.}
}
@inproceedings{tmdmpi:fpl:2006,
title = {TMD-MPI: An MPI Implementation for Multiple Processors Across Multiple FPGAs},
author = {Saldana, Manuel and Chow, Paul},
year = 2006,
booktitle = {FPL}
}
@article{asicclouds:cacm:2020,
title = {ASIC Clouds: Specializing the Datacenter for Planet-Scale Applications},
author = {Taylor, Michael Bedford and Vega, Luis and Khazraee, Moein and Magaki, Ikuo and Davidson, Scott and Richmond, Dustin},
year = 2020,
journal = {CACM}
}
@article{meta-inference-accelerator:arxiv:2021,
title = {First-generation Inference Accelerator Deployment at Facebook},
author = {Anderson, Michael and Chen, Benny and Chen, Stephen and Deng, Summer and Fix, Jordan and Gschwind, Michael and Kalaiah, Aravind and Kim, Changkyu and Lee, Jaewon and Liang, Jason and others},
year = 2021,
journal = {arXiv preprint}
}
@article{meta-training:arxiv:2020,
title = {Deep Learning Training in Facebook Data Centers: Design of Scale-up and Scale-out Systems},
author = {Naumov, Maxim and Kim, John and Mudigere, Dheevatsa and Sridharan, Srinivas and Wang, Xiaodong and Zhao, Whitney and Yilmaz, Serhat and Kim, Changkyu and Yuen, Hector and Ozdal, Mustafa and Nair, Krishnakumar and Gao, Isabel and Su, Bor-Yiing and Yang, Jiyan and Smelyanskiy, Mikhail},
year = 2020,
journal = {arXiv preprint}
}
@misc{meta-mount-shasta:2019,
title = {Mount Shasta for Video Transcoding},
url = {https://engineering.fb.com/2019/03/14/data-center-engineering/accelerating-infrastructure/}
}
@misc{nxp-powerquad:2019,
title = {AN12282: Digital Signal Processing for NXP LPC5500 Using PowerQuad},
url = {https://www.nxp.com/docs/en/application-note/AN12282.pdf}
}
@misc{analog-devices-ffta:2019,
title = {C/C++ Library Manual for SHARC Processors},
url = {https://www.analog.com/media/en/dsp-documentation/softwaremanuals/cces-sharclibrary-manual.pdf}
}
@inproceedings{fftw-acc:hpec:2022,
title = {A High Throughput Hardware Accelerator for FFTW Codelets: A First Look},
author = {Tang, Larry and Chen, Siyuan and Harisrikanth, Keshav and Xu, Guanglin and Mai, Ken and Franchetti, Franz},
year = 2022,
booktitle = {HPEC}
}
@inproceedings{facc:pldi:2022,
title = {Bind the Gap: Compiling Real Software to Hardware FFT Accelerators},
author = {Woodruff, Jackson and Armengol-Estap\'{e}, Jordi and Ainsworth, Sam and O'Boyle, Michael F. P.},
year = 2022,
booktitle = {PLDI},
optabstract = {Specialized hardware accelerators continue to be a source of performance improvement. However, such specialization comes at a programming price. The fundamental issue is that of a mismatch between the diversity of user code and the functionality of fixed hardware, limiting its wider uptake. Here we focus on a particular set of accelerators: those for Fast Fourier Transforms. We present FACC (Fourier ACcelerator Compiler), a novel approach to automatically map legacy code to Fourier Transform accelerators. It automatically generates drop-in replacement adapters using Input-Output (IO)-based program synthesis that bridge the gap between user code and accelerators. We apply FACC to unmodified GitHub C programs of varying complexity and compare against two existing approaches. We target FACC to a high-performance library, FFTW, and two hardware accelerators, the NXP PowerQuad and the Analog Devices FFTA, and demonstrate mean speedups of 9x, 17x and 27x respectively}
}
@inproceedings{rxpsc:dac:2021,
title = {New Regular Expressions on Old Accelerators},
author = {Woodruff, Jackson and O'Boyle, Michael F. P.},
year = 2021,
booktitle = {DAC}
}
@inproceedings{amd-400g-smartnic:hotchips:2022,
title = {AMD 400G Adaptive SmartNIC SoC: Technology preview},
author = {Dastidar, Jaideep and Riddoch, David and Moore, Jason and Pope, Steve and Wesselkamper, Jim},
year = 2022,
booktitle = {2022 IEEE Hot Chips 34 Symposium (HCS)}
}
@inproceedings{auto-nic-offload:asplos:2021,
title = {Autonomous NIC Offloads},
author = {Pismenny, Boris and Eran, Haggai and Yehezkel, Aviad and Liss, Liran and Morrison, Adam and Tsafrir, Dan},
year = 2021,
booktitle = {ASPLOS},
optabstract = {CPUs routinely offload to NICs network-related processing tasks like packet segmentation and checksum. NIC offloads are advantageous because they free valuable CPU cycles. But their applicability is typically limited to layer≤4 protocols (TCP and lower), and they are inapplicable to layer-5 protocols (L5Ps) that are built on top of TCP. This limitation is caused by a misfeature we call ”offload dependence,” which dictates that L5P offloading additionally requires offloading the underlying layer≤4 protocols and related functionality: TCP, IP, firewall, etc. The dependence of L5P offloading hinders innovation, because it implies hard-wiring the complicated, ever-changing implementation of the lower-level protocols. We propose ”autonomous NIC offloads,” which eliminate offload dependence. Autonomous offloads provide a lightweight software-device architecture that accelerates L5Ps without having to migrate the entire layer≤4 TCP/IP stack into the NIC. A main challenge that autonomous offloads address is coping with out-of-sequence packets. We implement autonomous offloads for two L5Ps: (i) NVMe-over-TCP zero-copy and CRC computation, and (ii) https authentication, encryption, and decryption. Our autonomous offloads increase throughput by up to 3.3x, and they deliver CPU consumption and latency that are as low as 0.4x and 0.7x, respectively. Their implementation is already upstreamed in the Linux kernel, and they will be supported in the next-generation of Mellanox NICs.}
}
@inproceedings{regx:micro:2012,
title = {Designing a Programmable Wire-Speed Regular-Expression Matching Accelerator},
author = {Lunteren, Jan Van and Hagleitner, Christoph and Heil, Timothy and Biran, Giora and Shvadron, Uzi and Atasu, Kubilay},
year = 2012,
booktitle = {MICRO}
}
@inproceedings{hare:micro:2016,
title = {HARE: Hardware Accelerator for Regular Expressions},
author = {Gogte, Vaibhav and Kolli, Aasheesh and Cafarella, Michael J. and D'Antoni, Loris and Wenisch, Thomas F.},
year = 2016,
booktitle = {MICRO}
}
@inproceedings{cloud-scale-acc:micro:2016,
title = {A Cloud-Scale Acceleration Architecture},
author = {Caulfield, Adrian M. and Chung, Eric S. and Putnam, Andrew and Angepat, Hari and Fowers, Jeremy and Haselman, Michael and Heil, Stephen and Humphrey, Matt and Kaur, Puneet and Kim, Joo-Young and Lo, Daniel and Massengill, Todd and Ovtcharov, Kalin and Papamichael, Michael and Woods, Lisa and Lanka, Sitaram and Chiou, Derek and Burger, Doug},
year = 2016,
booktitle = {MICRO}
}
@inproceedings{cheetah:hpca:2021,
title = {Cheetah: Optimizing and Accelerating Homomorphic Encryption for Private Inference},
author = {Reagen, Brandon and Choi, Woo-Seok and Ko, Yeongil and Lee, Vincent T. and Lee, Hsien-Hsin S. and Wei, Gu-Yeon and Brooks, David},
year = 2021,
booktitle = {HPCA}
}
@inproceedings{bts:isca:2022,
title = {BTS: An Accelerator for Bootstrappable Fully Homomorphic Encryption},
author = {Kim, Sangpyo and Kim, Jongmin and Kim, Michael Jaemin and Jung, Wonkyung and Kim, John and Rhu, Minsoo and Ahn, Jung Ho},
year = 2022,
booktitle = {ISCA},
optabstract = {Homomorphic encryption (HE) enables the secure offloading of computations to the cloud by providing computation on encrypted data (ciphertexts). HE is based on noisy encryption schemes in which noise accumulates as more computations are applied to the data. The limited number of operations applicable to the data prevents practical applications from exploiting HE. Bootstrapping enables an unlimited number of operations or fully HE (FHE) by refreshing the ciphertext. Unfortunately, bootstrapping requires a significant amount of additional computation and memory bandwidth as well. Prior works have proposed hardware accelerators for computation primitives of FHE. However, to the best of our knowledge, this is the first to propose a hardware FHE accelerator that supports bootstrapping as a first-class citizen.In particular, we propose BTS --- Bootstrappable, Technology-driven, Secure accelerator architecture for FHE. We identify the challenges of supporting bootstrapping in the accelerator and analyze the off-chip memory bandwidth and computation required. In particular, given the limitations of modern memory technology, we identify the HE parameter sets that are efficient for FHE acceleration. Based on the insights gained from our analysis, we propose BTS, which effectively exploits the parallelism innate in HE operations by arranging a massive number of processing elements in a grid. We present the design and microarchitecture of BTS, including a network-on-chip design that exploits a deterministic communication pattern. BTS shows 5,556\texttimes{} and 1,306\texttimes{} improved execution time on ResNet-20 and logistic regression over a CPU, with a chip area of 373.6mm2 and up to 163.2W of power.}
}
@misc{ibm-aiu:2022,
title = {{IBM} Artificial Intelligence Unit},
url = {https://research.ibm.com/blog/ibm-artificial-intelligence-unit-aiu}
}
@inproceedings{ibm-compression-accelerator:isca:2020,
title = {Data Compression Accelerator on IBM POWER9 and z15 Processors : Industrial Product},
author = {Abali, Bulent and Blaner, Bart and Reilly, John and Klein, Matthias and Mishra, Ashutosh and Agricola, Craig B. and Sendir, Bedri and Buyuktosunoglu, Alper and Jacobi, Christian and Starke, William J. and Myneni, Haren and Wang, Charlie},
year = 2020,
booktitle = {ISCA}
}
@inproceedings{google-vcu:asplos:2021,
title = {Warehouse-Scale Video Acceleration: Co-Design and Deployment in the Wild},
author = {Ranganathan, Parthasarathy and Stodolsky, Daniel and Calow, Jeff and Dorfman, Jeremy and Guevara, Marisabel and Smullen IV, Clinton Wills and Kuusela, Aki and Balasubramanian, Raghu and Bhatia, Sandeep and Chauhan, Prakash and Cheung, Anna and Chong, In Suk and Dasharathi, Niranjani and Feng, Jia and Fosco, Brian and Foss, Samuel and Gelb, Ben and Gwin, Sara J. and Hase, Yoshiaki and He, Da-ke and Ho, C. Richard and Huffman Jr., Roy W. and Indupalli, Elisha and Jayaram, Indira and Kongetira, Poonacha and Kyaw, Cho Mon and Laursen, Aaron and Li, Yuan and Lou, Fong and Lucke, Kyle A. and Maaninen, JP and Macias, Ramon and Mahony, Maire and Munday, David Alexander and Muroor, Srikanth and Penukonda, Narayana and Perkins-Argueta, Eric and Persaud, Devin and Ramirez, Alex and Rautio, Ville-Mikko and Ripley, Yolanda and Salek, Amir and Sekar, Sathish and Sokolov, Sergey N. and Springer, Rob and Stark, Don and Tan, Mercedes and Wachsler, Mark S. and Walton, Andrew C. and Wickeraad, David A. and Wijaya, Alvin and Wu, Hon Kwan},
year = 2021,
booktitle = {ASPLOS},
optabstract = {Video sharing (e.g., YouTube, Vimeo, Facebook, TikTok) accounts for the majority of internet traffic, and video processing is also foundational to several other key workloads (video conferencing, virtual/augmented reality, cloud gaming, video in Internet-of-Things devices, etc.). The importance of these workloads motivates larger video processing infrastructures and – with the slowing of Moore’s law – specialized hardware accelerators to deliver more computing at higher efficiencies. This paper describes the design and deployment, at scale, of a new accelerator targeted at warehouse-scale video transcoding. We present our hardware design including a new accelerator building block – the video coding unit (VCU) – and discuss key design trade-offs for balanced systems at data center scale and co-designing accelerators with large-scale distributed software systems. We evaluate these accelerators “in the wild" serving live data center jobs, demonstrating 20-33x improved efficiency over our prior well-tuned non-accelerated baseline. Our design also enables effective adaptation to changing bottlenecks and improved failure management, and new workload capabilities not otherwise possible with prior systems. To the best of our knowledge, this is the first work to discuss video acceleration at scale in large warehouse-scale environments.}
}
@misc{aws-inferentia:2019,
title = {{AWS} Inferentia},
url = {https://aws.amazon.com/machine-learning/inferentia/}
}
@misc{aws-trainium:2022,
title = {{AWS} Trainium},
url = {https://aws.amazon.com/machine-learning/trainium/}
}
@inproceedings{ava:asplos:2020,
title = {AvA: Accelerated Virtualization of Accelerators},
author = {Yu, Hangchen and Peters, Arthur Michener and Akshintala, Amogh and Rossbach, Christopher J.},
year = 2020,
booktitle = {ASPLOS}
}
@inproceedings{synergy:asplos:2021,
title = {Compiler-Driven FPGA Virtualization with SYNERGY},
author = {Landgraf, Joshua and Yang, Tiffany and Lin, Will and Rossbach, Christopher J. and Schkufza, Eric},
year = 2021,
booktitle = {ASPLOS}
}
@inproceedings{gables:hpca:2019,
title = {Gables: A Roofline model for Mobile SoCs},
author = {Hill, Mark and Reddi, Vijay Janapa},
year = 2019,
booktitle = {HPCA}
}
@article{poas:arxiv:2022,
title = {POAS: A High-performance Scheduling Framework For Exploiting Accelerator Level Parallelism},
author = {Mart{\'\i}nez, Pablo Antonio and Bernab{\'e}, Gregorio and Garc{\'\i}a, Jose Manuel},
year = 2022,
journal = {arXiv preprint}
}
@inproceedings{meet-the-walker:micro:2013,
title = {Meet the Walkers: Accelerating Index Traversals for in-Memory Databases},
author = {Kocberber, Onur and Grot, Boris and Picorel, Javier and Falsafi, Babak and Lim, Kevin and Ranganathan, Parthasarathy},
year = 2013,
booktitle = {MICRO}
}
@article{alp:cacm:2021,
title = {Accelerator-level Parallelism},
author = {Hill, Mark D and Reddi, Vijay Janapa},
year = 2021,
journal = {Communications of the ACM},
volume = 64,
number = 12,
pages = {36--38}
}
@inproceedings{robomorphic:asplos:2021,
title = {Robomorphic Computing: A Design Methodology for Domain-Specific Accelerators Parameterized by Robot Morphology},
author = {Neuman, Sabrina M. and Plancher, Brian and Bourgeat, Thomas and Tambe, Thierry and Devadas, Srinivas and Reddi, Vijay Janapa},
year = 2021,
booktitle = {ASPLOS}
}
@inproceedings{dua:nsdi:2019,
title = {Direct Universal Access: Making Data Center Resources Available to {FPGA}},
author = {Ran Shu and Peng Cheng and Guo Chen and Zhiyuan Guo and Lei Qu and Yongqiang Xiong and Derek Chiou and Thomas Moscibroda},
year = 2019,
booktitle = {NSDI}
}
@inproceedings{morpheus:isca:2016,
title = {Morpheus: Creating Application Objects Efficiently for Heterogeneous Computing},
author = {Tseng, Hung-Wei and Zhao, Qianchen and Zhou, Yuxiao and Gahagan, Mark and Swanson, Steven},
year = 2016,
booktitle = {ISCA}
}
@inproceedings{lynx:asplos:2020,
title = {Lynx: A SmartNIC-Driven Accelerator-Centric Architecture for Network Servers},
author = {Tork, Maroun and Maudlej, Lina and Silberstein, Mark},
year = 2020,
booktitle = {ASPLOS}
}
@inproceedings{zeppelin:isscc:2018,
title = {Zeppelin: An SoC for multichip architectures},
author = {Beck, Noah and White, Sean and Paraschou, Milam and Naffziger, Samuel},
year = 2018,
booktitle = {IEEE ISSCC}
}
@misc{xilinx-xdma-perf-limit,
title = {Xilinx XDMA Performance},
url = {https://support.xilinx.com/s/article/68049}
}
@misc{xilinx-xdma,
title = {Xilinx XDMA Driver},
url = {https://github.com/Xilinx/dma_ip_drivers/tree/master/XDMA}
}
@misc{linux-gem-lwn,
title = {LWN.net article on GEM},
url = {https://lwn.net/Articles/283798/}
}
@misc{linux-drm-gem,
title = {Linux kernel DRM-GEM drivers},
url = {https://www.kernel.org/doc/html/latest/gpu/drm-mm.html}
}
@inproceedings{acc-yolov3:iscas:2020,
title = {Accelerating Tiny YOLOv3 using FPGA-Based Hardware/Software Co-Design},
author = {Ahmad, Afzal and Pasha, Muhammad Adeel and Raza, Ghulam Jilani},
year = 2020,
booktitle = {IEEE ISCAS}
}
@article{chiosa:pvldb:2022,
title = {Hardware Acceleration of Compression and Encryption in SAP HANA},
author = {Chiosa, Monica and Maschi, Fabio and M\"{u}ller, Ingo and Alonso, Gustavo and May, Norman},
year = 2022,
journal = {Proc. VLDB Endow.},
volume = 15,
number = 12,
pages = {3277–3291}
}
@inproceedings{doppiodb:fpl:2017,
title = {doppioDB: A hardware accelerated database},
author = {Sidler, David and Owaida, Muhsen and István, Zsolt and Kara, Kaan and Alonso, Gustavo},
year = 2017,
booktitle = {FPL}
}
@inproceedings{casper:fpga:2014,
title = {Hardware Acceleration of Database Operations},
author = {Casper, Jared and Olukotun, Kunle},
year = 2014,
booktitle = {ACM FPGA}
}
@misc{microsoft-presidio,
title = {Presidio: Data Protection and Anonymization SDK},
url = {https://microsoft.github.io/presidio/}
}
@misc{urban-sound-detection,
title = {Urban Sound Detection},
url = {https://urbansounddataset.weebly.com/urbansound8k.html}
}
@inproceedings{rldbs:ijcai:2020,
title = {Reinforcement Learning Framework for Deep Brain Stimulation Study},
author = {Krylov, Dmitrii and des Combes, Remi and Laroche, Romain and Rosenblum, Michael and Dylov, Dmitry V},
year = 2020,
booktitle = {IJCAI}
}
@misc{aws-vt1-instance,
title = {AWS VT1 Instance},
url = {https://xilinx.github.io/video-sdk/v1.5/getting_started_on_vt1.html}
}
@misc{xilinx-vitis-libraries,
title = {Xilinx Vitis Libraries},
url = {https://xilinx.github.io/Vitis_Libraries/}
}
@misc{xilinx-vitis-database,
title = {Xilinx Vitis Database Library},
url = {https://xilinx.github.io/Vitis_Libraries/database/2022.1/index.html}
}
@misc{xilinx-vitis-data-analytics,
title = {Xilinx Vitis Data Analytics Library},
url = {https://xilinx.github.io/Vitis_Libraries/data_analytics/2022.1/index.html}
}
@misc{xilinx-vitis-data-compression,
title = {Xilinx Vitis Data Compression Library},
url = {https://xilinx.github.io/Vitis_Libraries/data_compression/2022.1/index.html}
}
@misc{xilinx-vitis-security,
title = {Xilinx Vitis Security Library},
url = {https://xilinx.github.io/Vitis_Libraries/security/2022.1/index.html}
}
@misc{xilinx-vitis-dsp,
title = {Xilinx Vitis DSP Library},
url = {https://xilinx.github.io/Vitis_Libraries/dsp/2022.1/index.html}
}
@misc{xilinx-u30-vcu,
title = {Xilinx U30 VCU},
url = {https://www.xilinx.com/content/dam/xilinx/support/documents/data_sheets/ds970-u30.pdf}
}
@inproceedings{spin:atc:2017,
title = {{SPIN}: Seamless Operating System Integration of {Peer-to-Peer} {DMA} Between {SSDs} and {GPUs}},
author = {Shai Bergman and Tanya Brokhman and Tzachi Cohen and Mark Silberstein},
year = 2017,
booktitle = {ATC}
}
@inproceedings{p2pdma:apsys:2020,
title = {How Beneficial is Peer-to-Peer DMA?},
author = {Nakamura, Ryo and Kuga, Yohei and Akashi, Kunio},
year = 2020,
booktitle = {APSys}
}
@inproceedings{floem:osdi:2018,
title = {Floem: A Programming System for {NIC-Accelerated} Network Applications},
author = {Phitchaya Mangpo Phothilimthana and Ming Liu and Antoine Kaufmann and Simon Peter and Rastislav Bodik and Thomas Anderson},
year = 2018,
booktitle = {OSDI}
}
@inproceedings{tensorflow:osdi:2016,
title = {{TensorFlow}: A System for {Large-Scale} Machine Learning},
author = {Mart{\'\i}n Abadi and Paul Barham and Jianmin Chen and Zhifeng Chen and Andy Davis and Jeffrey Dean and Matthieu Devin and Sanjay Ghemawat and Geoffrey Irving and Michael Isard and Manjunath Kudlur and Josh Levenberg and Rajat Monga and Sherry Moore and Derek G. Murray and Benoit Steiner and Paul Tucker and Vijay Vasudevan and Pete Warden and Martin Wicke and Yuan Yu and Xiaoqiang Zheng},
year = 2016,
booktitle = {OSDI}
}
@misc{apache:beam,
title = {Apache Beam},
year = 2021,
url = {https://beam.apache.org/}
}
@misc{google:dataflow,
title = {Google Dataflow},
year = 2021,
url = {https://cloud.google.com/dataflow}
}
@inproceedings{naiad:sosp:2013,
title = {Naiad: A Timely Dataflow System},
author = {Murray, Derek G. and McSherry, Frank and Isaacs, Rebecca and Isard, Michael and Barham, Paul and Abadi, Mart\'{\i}n},
year = 2013,
booktitle = {SOSP}
}
@inproceedings{dandelion:sosp:2013,
title = {Dandelion: A Compiler and Runtime for Heterogeneous Systems},
author = {Rossbach, Christopher J. and Yu, Yuan and Currey, Jon and Martin, Jean-Philippe and Fetterly, Dennis},
year = 2013,
booktitle = {SOSP}
}
@inproceedings{logca:isca:2017,
title = {LogCA: A High-Level Performance Model for Hardware Accelerators},
author = {Altaf, Muhammad Shoaib Bin and Wood, David A.},
year = 2017,
booktitle = {ISCA}
}
@article{tf.data:pvldb:2021,
title = {{tf.data}: A Machine Learning Data Processing Framework},
author = {Murray, Derek G. and \v{S}im\v{s}a, Ji\v{r}\'{\i} and Klimovic, Ana and Indyk, Ihor},
year = 2021,
journal = {Proc. VLDB Endow.},
volume = 14,
number = 12
}
@inproceedings{dsi-dlrm:isca:2022,
title = {Understanding Data Storage and Ingestion for Large-Scale Deep Recommendation Model Training: Industrial Product},
author = {Zhao, Mark and Agarwal, Niket and Basant, Aarti and Gedik, Bu\u{g}ra and Pan, Satadru and Ozdal, Mustafa and Komuravelli, Rakesh and Pan, Jerry and Bao, Tianshu and Lu, Haowei and Narayanan, Sundaram and Langman, Jack and Wilfong, Kevin and Rastogi, Harsha and Wu, Carole-Jean and Kozyrakis, Christos and Pol, Parik},
year = 2022,
booktitle = {ISCA}
}
@inproceedings{urbansound-dataset:mm:2014,
title = {A Dataset and Taxonomy for Urban Sound Research},
author = {Salamon, Justin and Jacoby, Christopher and Bello, Juan Pablo},
year = 2014,
booktitle = {ACM Multimedia}
}
@inproceedings{tut-database:eusipco:2016,
title = {TUT database for acoustic scene classification and sound event detection},
author = {Mesaros, Annamaria and Heittola, Toni and Virtanen, Tuomas},
year = 2016,
booktitle = {EUSIPCO}
}
@inproceedings{flexdriver:asplos:2022,
title = {FlexDriver: A Network Driver for Your Accelerator},
author = {Eran, Haggai and Fudim, Maxim and Malka, Gabi and Shalom, Gal and Cohen, Noam and Hermony, Amit and Levi, Dotan and Liss, Liran and Silberstein, Mark},
year = 2022,
booktitle = {ASPLOS}
}
@inproceedings{nds:micro:2021,
title = {NDS: N-Dimensional Storage},
author = {Liu, Yu-Chia and Tseng, Hung-Wei},
year = 2021,
booktitle = {MICRO}
}
@misc{nvidia-v100,
title = {NVIDIA V100 Overview},
url = {https://www.nvidia.com/en-us/data-center/v100/},
howpublished = {\url{https://www.nvidia.com/en-us/data-center/v100/}}
}
@misc{intel-dsa,
title = {Intel Data Streaming Accelerator},
url = {https://www.intel.com/content/www/us/en/develop/articles/intel-data-streaming-accelerator-architecture-specification.html}
}
@inproceedings{protobuf:isca:2021,
title = {A Hardware Accelerator for Protocol Buffers},
author = {Karandikar, Sagar and Leary, Chris and Kennelly, Chris and Zhao, Jerry and Parimi, Dinesh and Nikolic, Borivoje and Asanovic, Krste and Ranganathan, Parthasarathy},
year = 2021,
booktitle = {MICRO}
}
@inproceedings{peltenburg-2019-fletcher,
title = {Fletcher: A Framework to Efficiently Integrate {FPGA} Accelerators with {Apache Arrow}},
author = {Peltenburg, Johan and Van Straten, Jeroen and Wijtemans, Lars and Van Leeuwen, Lars and Al-Ars, Zaid and Hofstee, Peter},
year = 2019,
booktitle = {FPL}
}
@inproceedings{hgum:reconfig:2017,
title = {{HGum}: Messaging Framework for Hardware Accelerators},
author = {Zhang, Sizhuo and Angepat, Hari and Chiou, Derek},
year = 2017,
booktitle = {ReConFig}
}
@inproceedings{llama:socc:2021,
title = {Llama: A Heterogeneous \& Serverless Framework for Auto-Tuning Video Analytics Pipelines},
author = {Romero, Francisco and Zhao, Mark and Yadwadkar, Neeraja J. and Kozyrakis, Christos},
year = 2021,
booktitle = {SoCC}
}
@inproceedings{interstellar:asplos:2020,
title = {Interstellar: Using Halide's Scheduling Language to Analyze DNN Accelerators},
author = {Yang, Xuan and Gao, Mingyu and Liu, Qiaoyi and Setter, Jeff and Pu, Jing and Nayak, Ankita and Bell, Steven and Cao, Kaidi and Ha, Heonjae and Raina, Priyanka and Kozyrakis, Christos and Horowitz, Mark},
year = 2020,
booktitle = {ASPLOS}
}
@inproceedings{decibel:nsdi:2017,
title = {Decibel: Isolation and Sharing in Disaggregated {Rack-Scale} Storage},
author = {Mihir Nanavati and Jake Wires and Andrew Warfield},
year = 2017,
booktitle = {NSDI}
}
@inproceedings{legtchenko:hotstorage:2017,
title = {Understanding {Rack-Scale} Disaggregated Storage},
author = {Sergey Legtchenko and Hugh Williams and Kaveh Razavi and Austin Donnelly and Richard Black and Andrew Douglas and Nathanael Cheriere and Daniel Fryer and Kai Mast and Angela Demke Brown and Ana Klimovic and Andy Slowey and Antony Rowstron},
year = 2017,
booktitle = {HotStorage}
}
@article{do:cacm:2019,
title = {Programmable Solid-State Storage in Future Cloud Datacenters},
author = {Do, Jaeyoung and Sengupta, Sudipta and Swanson, Steven},
year = 2019,
journal = {Commun. ACM},
volume = 62,
number = 6
}
@inproceedings{leapio:asplos:2020,
title = {LeapIO: Efficient and Portable Virtual NVMe Storage on ARM SoCs},
author = {Li, Huaicheng and Hao, Mingzhe and Novakovic, Stanko and Gogte, Vaibhav and Govindan, Sriram and Ports, Dan R. K. and Zhang, Irene and Bianchini, Ricardo and Gunawi, Haryadi S. and Badam, Anirudh},
year = 2020,
booktitle = {ASPLOS}
}
@inproceedings{flash-disaggregation:eurosys:2016,
title = {Flash Storage Disaggregation},
author = {Klimovic, Ana and Kozyrakis, Christos and Thereska, Eno and John, Binu and Kumar, Sanjeev},
year = 2016,
booktitle = {EuroSys}
}
@inproceedings{zhu:cluster:2019,
title = {Efficient User-Level Storage Disaggregation for Deep Learning},
author = {Zhu, Yue and Yu, Weikuan and Jiao, Bing and Mohror, Kathryn and Moody, Adam and Chowdhury, Fahim},
year = 2019,
booktitle = {CLUSTER}
}
@inproceedings{spool:atc:2020,
title = {{Spool}: Reliable Virtualized {NVMe} Storage Pool in Public Cloud Infrastructure},
author = {Shuai Xue and Shang Zhao and Quan Chen and Gang Deng and Zheng Liu and Jie Zhang and Zhuo Song and Tao Ma and Yong Yang and Yanbo Zhou and Keqiang Niu and Sijie Sun and Minyi Guo},
year = 2020,
booktitle = {ATC}
}
@inproceedings{nvmeof-arm:msst:2019,
title = {When NVMe over Fabrics Meets Arm: Performance and Implications},
author = {Jia, Yichen and Anger, Eric and Chen, Feng},
year = 2019,
booktitle = {MSST}
}
@inproceedings{nvmeof:systor:2017,
title = {NVMe-over-Fabrics Performance Characterization and the Path to Low-Overhead Flash Disaggregation},
author = {Guz, Zvika and Li, Harry (Huan) and Shayesteh, Anahita and Balakrishnan, Vijay},
year = 2017,
booktitle = {SYSTOR}
}
@inproceedings{kim:asbd:2017,
title = {How Much Computation Power do you need for Near-Data Processing in Cloud?},
author = {Namhyung Kim and Jeongseob Ahn and Sungpack Hong and Hassan Chafi and Kiyoung Choi},
year = 2017,
booktitle = {ASBD}
}
@inproceedings{clicknp:sigcomm:2016,
title = {ClickNP: Highly Flexible and High Performance Network Processing with Reconfigurable Hardware},
author = {Li, Bojie and Tan, Kun and Luo, Layong (Larry) and Peng, Yanqing and Luo, Renqian and Xu, Ningyi and Xiong, Yongqiang and Cheng, Peng and Chen, Enhong},
year = 2016,
booktitle = {SIGCOMM}
}
@inproceedings{pcie-nic:sigcomm:2018,
title = {Understanding PCIe Performance for End Host Networking},
author = {Neugebauer, Rolf and Antichi, Gianni and Zazo, Jos\'{e} Fernando and Audzevich, Yury and L\'{o}pez-Buedo, Sergio and Moore, Andrew W.},
year = 2018,
booktitle = {SIGCOMM}
}
@inproceedings{memif:asplos:2016,
title = {Memif: Towards Programming Heterogeneous Memory Asynchronously},
author = {Lin, Felix Xiaozhu and Liu, Xu},
year = 2016,
booktitle = {ASPLOS}
}
@misc{aws-s3-latency:2019,
title = {{Amazon CloudWatch Percentiles on Amazon S3}},
year = 2019,
url = {https://aws.amazon.com/blogs/storage/amazon-s3-cloudwatch-percentiles/},
howpublished = {\url{https://aws.amazon.com/blogs/storage/amazon-s3-cloudwatch-percentiles/}}
}
@misc{pcie-p2pdma:lwn:2019,
title = {{Device-to-device memory-transfer offload with P2PDMA}},
year = 2019,
url = {https://lwn.net/Articles/767281/}
}
@misc{pcie-p2pdma:kernel:2018,
title = {{PCI Peer-to-Peer DMA Support}},
year = 2018,
url = {https://docs.kernel.org/driver-api/pci/p2pdma.html}
}
@misc{openwhisk:serverless:2021,
title = {{Apache OpenWhisk}},
url = {https://openwhisk.apache.org/},
howpublished = {\url{https://openwhisk.apache.org/}}
}
@inproceedings{gimbal:sigcomm:2021,
title = {Gimbal: Enabling Multi-Tenant Storage Disaggregation on SmartNIC JBOFs},
author = {Min, Jaehong and Liu, Ming and Chugh, Tapan and Zhao, Chenxingyu and Wei, Andrew and Doh, In Hwan and Krishnamurthy, Arvind},
year = 2021,
booktitle = {SIGCOMM}
}
@inproceedings{i10:osdi:2020,
title = {{TCP}$\approx${RDMA}: {CPU-efficient} Remote Storage Access with i10},
author = {Jaehyun Hwang and Qizhe Cai and Ao Tang and Rachit Agarwal},
year = 2020,
booktitle = {NSDI}
}
@inproceedings{1rma:sigcomm:2020,
title = {{1RMA}: Re-Envisioning Remote Memory Access for Multi-Tenant Datacenters},
author = {Singhvi, Arjun and Akella, Aditya and Gibson, Dan and Wenisch, Thomas F. and Wong-Chan, Monica and Clark, Sean and Martin, Milo M. K. and McLaren, Moray and Chandra, Prashant and Cauble, Rob and Wassel, Hassan M. G. and Montazeri, Behnam and Sabato, Simon L. and Scherpelz, Joel and Vahdat, Amin},
year = 2020,
booktitle = {SIGCOMM}
}
@inproceedings{lim:isca:2009,
title = {Disaggregated Memory for Expansion and Sharing in Blade Servers},
author = {Lim, Kevin and Chang, Jichuan and Mudge, Trevor and Ranganathan, Parthasarathy and Reinhardt, Steven K. and Wenisch, Thomas F.},
year = 2009,
booktitle = {ISCA}
}
@inproceedings{network-for-disaggregation:osdi:2016,
title = {Network Requirements for Resource Disaggregation},
author = {Peter X. Gao and Akshay Narayan and Sagar Karandikar and Joao Carreira and Sangjin Han and Rachit Agarwal and Sylvia Ratnasamy and Scott Shenker},
year = 2016,
booktitle = {OSDI}
}
@inproceedings{reflex:asplos:2017,
title = {ReFlex: Remote Flash $\approx$ Local Flash},
author = {Klimovic, Ana and Litz, Heiner and Kozyrakis, Christos},
year = 2017,
booktitle = {ASPLOS}
}
@misc{azure_serverless_computing:2021,
title = {Azure serverless},
url = {https://azure.microsoft.com/en-us/solutions/serverless/#overview},
howpublished = {\url{https://azure.microsoft.com/en-us/solutions/serverless/#overview}}
}
@misc{google_cloud_functions:2021,
title = {Google Cloud Functions},
url = {https://cloud.google.com/functions/docs/concepts/overview},
howpublished = {\url{https://cloud.google.com/functions/docs/concepts/overview}}
}
@misc{aws_lambda:2021,
title = {AWS Lambda},
url = {https://aws.amazon.com/lambda/},
howpublished = {\url{https://aws.amazon.com/lambda/}}
}
@article{rapl:ieee-micro:2012,
title = {Power-Management Architecture of the Intel Microarchitecture Code-Named Sandy Bridge},
author = {Rotem, Efraim and Naveh, Alon and Ananthakrishnan, Avinash and Weissmann, Eliezer and Rajwan, Doron},
year = 2012,
journal = {IEEE Micro},
volume = 32,
number = 2
}
@article{rapl-action:tompecs:2018,
title = {RAPL in Action: Experiences in Using RAPL for Power Measurements},
author = {Khan, Kashif Nizam and Hirki, Mikael and Niemi, Tapio and Nurminen, Jukka K. and Ou, Zhonghong},
year = 2018,
journal = {ACM Trans. Model. Perform. Eval. Comput. Syst.},
volume = 3,
number = 2
}
@inproceedings{large-scale-ssd:sigmetrics:2015,
title = {A Large-Scale Study of Flash Memory Failures in the Field},
author = {Meza, Justin and Wu, Qiang and Kumar, Sanjeev and Mutlu, Onur},
year = 2015,
booktitle = {SIGMETRICS}
}
@inproceedings{heatwatch:hpca:2018,
title = {HeatWatch: Improving 3D NAND Flash Memory Device Reliability by Exploiting Self-Recovery and Temperature Awareness},
author = {Luo, Yixin and Ghose, Saugata and Cai, Yu and Haratsch, Erich F. and Mutlu, Onur},
year = 2018,
booktitle = {HPCA}
}
@inproceedings{stannis:dac:2020,
title = {Stannis: Low-Power Acceleration of DNN Training Using Computational Storage Devices},
author = {HeydariGorji, Ali and Torabzadehkashi, Mahdi and Rezaei, Siavash and Bobarshad, Hossein and Alves, Vladimir and Chou, Pai H.},
year = 2020,
booktitle = {DAC}
}
@inproceedings{barbalace:cidr:2021,
title = {Computational Storage: Where Are We Today?},
author = {Barbalace, Antonio and Do, Jaeyoung},
year = 2021,
booktitle = {CIDR}
}
@article{asic-cloud:cacm:2020,
title = {ASIC Clouds: Specializing the Datacenter for Planet-Scale Applications},
author = {Taylor, Michael Bedford and Vega, Luis and Khazraee, Moein and Magaki, Ikuo and Davidson, Scott and Richmond, Dustin},
year = 2020,