diff --git a/README.md b/README.md index 171f5406..7d298284 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,10 @@ # Algorithmica v3 -Algorithmica is a free and open web book about Computer Science. +Algorithmica is an open-access web book dedicated to the art and science of computing. -If you are concerned with editing, please read the [contributing guide](https://ru.algorithmica.org/contributing/) (in Russian). +You can contribute via [Prose](https://prose.io/) by clicking on the pencil icon on the top right on any page or by editing its source directly on GitHub. We use a slightly different Markdown dialect, so if you are not sure that the change is correct (for example, editing an intricate LaTeX formula), you can install [Hugo](https://gohugo.io/) and build the site locally — or just create a pull request, and a preview link will be automatically generated for you. + +If you happen to speak Russian, please also read the [contributing guidelines](https://ru.algorithmica.org/contributing/). --- @@ -16,11 +18,11 @@ Key technical changes from the [previous version](https://github.com/algorithmic * Rich metadata support (language, sections, TOCs, authors...) * Automated global table of contents * Theming support +* Search support (Lunr) Short-term todo list: -* Search with lunr -* Themes (especially a better dark theme) -* Minor style adjustments for mobile and print versions +* Style adjustments for mobile and print versions * A pdf version of the whole website +* Meta-information support (for Google Scholar and social media) * [Sticky table of contents](https://css-tricks.com/table-of-contents-with-intersectionobserver/) diff --git a/assets/slides.sass b/assets/slides.sass index e69de29b..671ababe 100644 --- a/assets/slides.sass +++ b/assets/slides.sass @@ -0,0 +1,50 @@ +$font-text: 'Source Sans', serif !default +$font-code: 'Inconsolata', monospace !default +$font-headings: 'Garamond', serif !default + +$borders: 1px solid #eaecef !default + +/* fonts */ +@font-face + font-family: 'CMU' + src: url(fonts/cmu.woff2) + +@font-face + font-family: 'Merriweather' + src: url(fonts/merriweather.woff2) + +@font-face + font-family: 'Inconsolata' + src: url(fonts/inconsolata.woff2) + +@font-face + font-family: 'Garamond' + src: url(fonts/garamond.woff2) + +@font-face + font-family: "Open Sans" + src: url(fonts/opensans.woff2) + +@font-face + font-family: "Source Sans" + src: url(fonts/sourcesans.ttf) + +@font-face + font-family: "Crimson" + src: url(fonts/crimson.ttf) + +body + font-family: $font-text + font-size: 24px + +h1 + font-size: 2em + text-align: center + margin-top: 0 + margin-bottom: 20px + +h2 + font-size: 1.5em + +h3 + font-size: 1.25em diff --git a/config.yaml b/config.yaml index 7e4ca1b7..1f196de4 100644 --- a/config.yaml +++ b/config.yaml @@ -8,6 +8,15 @@ outputFormats: baseName: index mediaType: text/html isHTML: true + SearchIndex: + mediaType: "application/json" + baseName: "searchindex" + isPlainText: true + notAlternative: true +outputs: + home: + - HTML + - SearchIndex markup: goldmark: footnote: false # katex conflict @@ -33,8 +42,8 @@ languages: params: repo: "https://github.com/algorithmica-org/algorithmica" reveal_hugo: - theme: white + #theme: white slide_number: true transition: none - #custom_theme: "slides.sass" - #custom_theme_compile: true + custom_theme: "slides.sass" + custom_theme_compile: true diff --git a/content/english/_index.md b/content/english/_index.md index 91cf875a..f319cd0e 100644 --- a/content/english/_index.md +++ b/content/english/_index.md @@ -6,6 
+6,8 @@ noToc: true Algorithmica is an open-access web book dedicated to the art and science of computing. -It is created by [Sergey Slotin](http://sereja.me/) and teachers and students of [Tinkoff Generation](https://fintech.tinkoff.ru/study/generation/) — an educational organization that trains about half of the final-stage participants of Russian Olympiad in Informatics. +It is created by [Sergey Slotin](http://sereja.me/) and the teachers and students of [Tinkoff Generation](https://fintech.tinkoff.ru/study/generation/) — a nonprofit educational organization that trains about half of the finalists of the Russian Olympiad in Informatics. -The English version of the website is a work in progress; the only useful thing you can find there is the continuously updated draft of [Algorithms for Modern Hardware](hpc). We are currently more focused on [the Russian version](https://ru.algorithmica.org/), which hosts various course materials that we use ourselves. +The English version of the website is a work in progress; the only useful thing you can find here is the continuously updated draft of [Algorithms for Modern Hardware](hpc). We are currently more focused on [the Russian version](https://ru.algorithmica.org/), which hosts various course materials that we use ourselves. + +If you spot an error, please create an issue on [GitHub](https://github.com/algorithmica-org/algorithmica) or, preferably, fix it right away (the pencil icon on the top-right). diff --git a/content/english/hpc/_index.md b/content/english/hpc/_index.md index 6c2b4af3..9b6aa606 100644 --- a/content/english/hpc/_index.md +++ b/content/english/hpc/_index.md @@ -13,172 +13,113 @@ This is an upcoming high performance computing book titled "Algorithms for Moder Its intended audience is everyone from performance engineers and practical algorithm researchers to undergraduate computer science students who have just finished an advanced algorithms course and want to learn more practical ways to speed up a program than by going from $O(n \log n)$ to $O(n \log \log n)$. -All materials are hosted on GitHub, with code in a [separate repository](https://github.com/sslotin/scmm-code). This isn't a collaborative project, but any contributions and feedback are very much welcome. +All book materials are [hosted on GitHub](https://github.com/algorithmica-org/algorithmica), with code in a [separate repository](https://github.com/sslotin/scmm-code). This isn't a collaborative project, but any contributions and feedback are very much welcome. -### Part I: Performance Engineering +### FAQ -The first part covers the basics of computer architecture and optimization of single-threaded algorithms. +**Bug/typo fixes.** If you spot an error on any page, please do one of these — in the order of preference: -It walks through the main CPU optimization topics such as caching, SIMD and pipelining, and provides brief examples in C++, followed by large case studies where we usually achieve a significant speedup over some STL algorithm or data structure. +- fix it right away by either clicking on the pencil icon on the top right on any page (opens the [Prose](https://prose.io/) editor) or, more traditionally, by modifying the page directly on GitHub (the link to the source is also on the top right); +- create [an issue on GitHub](https://github.com/algorithmica-org/algorithmica/issues); +- [tell me](http://sereja.me/) about it directly; -``` -0. Why Go Beyond Big O -1. Analyzing Performance - 1.1. Computer Architecture & Assembly - 1.2. 
Negotiating with Compilers - 1.3. Profiling - 1.4. Binary GCD <- 2x faster std::gcd -2. Bit Hacks and Arithmetic - 2.1. Floating-Point Arithmetic - 2.2. Numerical Methods - 2.3. Integer Arithmetic - 2.4. Bit Manipulation - 2.5. Modular Arithmetic - 2.6. Finite Fields - 2.7. Cryptography, Hashing and PRNG - 2.8. Integer Factorization - 2.9. Bignum Arithmetic and the Karatsuba Algorithm - 2.10. Fast Fourier Transform -3. Memory - 3.1. External Memory Model - 3.2. Cache Locality - 3.3. Sublinear Algorithms - 3.4. RAM & CPU Caches - 3.5. Memory Management - 3.6. Layouts for Binary Search <- 5x faster std::lower_bound - 3.7. Implicit Data Structures <- 7x faster segment trees - 3.8. Hash Tables <- 5x faster std::unordered_map -4. SIMD Parallelism - 4.1. Intrinsics and Vector Extensions - 4.2. (Auto-)Vectorization - 4.3. SSE & AVX Cookbook - 4.4. Argmin with SIMD - 4.5. Logistic Regression - 4.6. Bitmaps - 4.7. String Searching <- ?x faster strstr - 4.8. Parsing Integers <- 2x faster scanf("%d") - 4.9. Sorting <- 8x faster std::sort -5. Instruction-Level Parallelism - 5.1. Pipelining and Hazards - 5.2. Throughput Computing <- 2x faster std::accumulate - 5.3. µOps & Scheduling - 5.4. Theoretical Performance Limits - 5.5. Matrix Multiplication <- 100x faster gemm -6. Summary -``` +or leave a comment on some other website where it is being discussed — I read most of [HackerNews](https://news.ycombinator.com/from?site=algorithmica.org), [CodeForces](https://codeforces.com/profile/sslotin), and [Twitter](https://twitter.com/sergey_slotin) threads where I'm tagged. -Among cool things that we will speed up: +**Release date.** The book is split into several parts that I plan to finish sequentially with long breaks in-between. Part I, Performance Engineering, is ~75% complete as of March 2022 and will hopefully be >95% complete by this summer. -- 2x faster GCD (compared to `std::gcd`) -- 5x faster binary search (compared to `std::lower_bound`) -- 7x faster segment trees -- 5x faster hash tables (compared to `std::unordered_map`) -- ~~?x faster popcount~~ -- 2x faster parsing series of integers (compared to `scanf`) -- ?x faster sorting (compared to `std::sort`) -- 2x faster sum (compared to `std::accumulate`) -- 100x faster matrix multiplication (compared to "for-for-for") -- optimal word-size integer factorization (~0.4ms per 60-bit integer) -- optimal Karatsuba Algorithm -- optimal FFT -- argmin at the speed of memory +A "release" for an open-source book like this essentially means: -This work is largely based on blog posts, research papers, conference talks and other work authored by a lot of people: +- finishing all essential sections and filling all the TODOs, +- mostly freezing the table of contents (except for the case studies), +- doing one final round of heavy copyediting (hopefully, with the help of a professional editor — I still haven’t figured out how commas work in English), +- drawing illustrations (I stole a lot of those that are currently displayed), +- making a print-optimized PDF and figuring out the best way to distribute it. 
-- [Agner Fog](https://agner.org/optimize/) -- [Daniel Lemire](https://lemire.me/en/#publications) -- [Andrei Alexandrescu](https://erdani.com/index.php/about/) -- Chandler Carruth -- [Wojciech Muła](http://0x80.pl/articles/index.html) -- [Malte Skarupke](https://probablydance.com/) -- [Travis Downs](https://travisdowns.github.io/) -- [Brendan Gregg](https://www.brendangregg.com/blog/index.html) -- [Andreas Abel](http://embedded.cs.uni-saarland.de/abel.php) -- [Jakob Kogler](https://cp-algorithms.com/) -- [Igor Ostrovsky](http://igoro.com/) -- [Steven Pigeon](https://hbfs.wordpress.com/) -- [Denis Bakhvalov](https://easyperf.net/notes/) -- [Paul Khuong](https://pvk.ca/) -- [Pat Morin](https://cglab.ca/~morin/) -- [Victor Eijkhout](https://www.tacc.utexas.edu/about/directory/victor-eijkhout) -- [Robert van de Geijn](https://www.cs.utexas.edu/~rvdg/) -- [Edmond Chow](https://www.cc.gatech.edu/~echow/) -- [Peter Cordes](https://stackoverflow.com/users/224132/peter-cordes) -- [ridiculous_fish](https://ridiculousfish.com/blog/) -- Kazushige Goto -- Matt Kulukundis -- Oleksandr Bacherikov +After that, I will mostly be fixing errors and only doing some minor edits reflecting the changes in technology or new algorithm advancements. The e-book/printed editions will most likely be sold on a "pay what you want" basis, and in any case, the web version will always be fully available online. -Volume: 300-400 pages -Release date: early 2022 +**Pre-ordering / financially supporting the book.** Due to my unfortunate citizenship and place of birth, you can't — that is, until I find a way that at the same time complies with international sanctions, doesn't sponsor [the war](https://en.wikipedia.org/wiki/2022_Russian_invasion_of_Ukraine), and won't put me in prison for tax evasion. -### Part II: Parallel Algorithms +So, don't bother. If you want to support this book, just share it and help fix typos — that would be enough. -Concurrency, models of parallelism, green threads and runtimes, cache coherence, synchronization primitives, OpenMP, reductions, scans, list ranking and graph algorithms, lock-free data structures, heterogeneous computing, CUDA, kernels, warps, blocks, matrix multiplication and sorting. +**Translations.** The website has a separate functionality for creating and managing translations — and I've already been contacted by some nice people willing to translate the book into Italian and Chinese (and I will personally translate at least some of it into my native Russian). -Volume: 150-200 pages -Release date: late 2022 / 2023? +However, as the book is still evolving, it is probably not the best idea to start translating it at least until Part I is finished. That said, you are very much encouraged to make translations of any articles and publish them in your blogs — just send me the link so that we can merge it back when centralized translation starts. -### Part III: Distributed Computing +**"Translating" the Russian version.** The articles hosted at [ru.algorithmica.org/cs/](https://ru.algorithmica.org/cs/) are not about advanced performance engineering but mostly about classical computer science algorithms — without discussing how to speed them up beyond asymptotic complexity. Most of the information there is not unique and already exists in English on some other places on the internet: for example, the similar-spirited [cp-algorithms.com](https://cp-algorithms.com/). 
-Communication-constrained algorithms, message passing, actor model, partitioning, MapReduce, consistency and reliability at scale, storage, compression, scheduling and cloud computing, distributed deep learning. +**Teaching performance engineering in colleges.** One of my goals for writing this book is to change the way computer science — algorithm design, to be more precise — is taught in colleges. Let me elaborate on that. -Release date: ??? +There are two highly impactful textbooks on which most computer science courses are built. Both are undoubtedly outstanding, but [one of them](https://en.wikipedia.org/wiki/The_Art_of_Computer_Programming) is 50 years old, and [the other](https://en.wikipedia.org/wiki/Introduction_to_Algorithms) is 30 years old, and [computers have changed a lot](/hpc/complexity/hardware) since then. Asymptotic complexity is not the sole deciding factor anymore. In modern practical algorithm design, you choose the approach that makes better use of different types of parallelism available in the hardware over the one that theoretically does fewer raw operations on galaxy-scale inputs. -### Part IV: Compilers and Domain-Specific Architectures +And yet, the computer science curricula in most colleges completely ignore this shift. Although there are some great courses that aim to correct that — such as "[Performance Engineering of Software Systems](https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-172-performance-engineering-of-software-systems-fall-2018/)" from MIT, "[Programming Parallel Computers](https://ppc.cs.aalto.fi/)" from Aalto University, and some non-academic ones like Denis Bakhvalov's "[Performance Ninja](https://github.com/dendibakh/perf-ninja)" — most computer science graduates still treat modern hardware like something from the 1990s. -LLVM IR, main optimization techniques from the dragon book, JIT-compilation, Cython, JAX, Numba, Julia, OpenCL, DPC++ and oneAPI, XLA, FPGAs and Verilog, ASICs, TPUs and other AI accelerators. +What I really want to achieve is that performance engineering becomes taught right after introduction to algorithms. Writing the first comprehensive textbook on the subject is a large part of it, and this is why I rush to finish it by the summer so that the colleges can pick it up in the next academic year. But creating a new course requires more than that: you need a balanced curriculum, course infrastructure, lecture slides, lab assignments… so for some time after finishing the main book, I will be working on course materials and tools for *teaching* performance engineering — and I'm looking forward to collaborating with other people who want to make it a reality as well. -Release date: ??? + + +### Part I: Performance Engineering + +The first part covers the basics of computer architecture and optimization of single-threaded algorithms. + +It walks through the main CPU optimization topics such as caching, SIMD, and pipelining, and provides brief examples in C++, followed by large case studies where we usually achieve a significant speedup over some STL algorithm or data structure. + +Planned table of contents: ``` -0. Preface: Why Go Beyond Big O -1. Computer Models +0. Preface +1. Complexity Models 1.1. Modern Hardware - 1.2. The "Speed" of Programming Languages - 1.3. The Relevance of Algorithmic Programming + 1.2. Programming Languages + 1.3. Models of Computation + 1.4. When to Optimize 2. Computer Architecture - 1.1. Introduction to Assembly - 1.2. Control Flow - 1.3. Loop Unrolling - 1.4. 
Operation Fusion - 1.5. Functions and Recursion - 1.6. Inlining - 1.7. Indirect Branching + 1.1. Instruction Set Architectures + 1.2. Assembly Language + 1.3. Loops and Conditionals + 1.4. Functions and Recursion + 1.5. Indirect Branching + 1.6. Machine Code Layout + 1.7. System Calls + 1.8. Virtualization 3. Instruction-Level Parallelism - 3.1. Pipelining and Hazards - 3.2. Branchless Computing - 3.3. Throughput Computing - 3.4. µOps & Scheduling - 3.5. Theoretical Performance Limits + 3.1. Pipeline Hazards + 3.2. The Cost of Branching + 3.3. Branchless Programming + 3.4. Instruction Tables + 3.5. Instruction Scheduling + 3.6. Throughput Computing + 3.7. Theoretical Performance Limits 4. Compilation - 4.1. Negotiating with Compilers - 4.2. Stitching Programs Together + 4.1. Stages of Compilation + 4.2. Flags and Targets 4.3. Situational Optimizations - 4.4. Contracts and Undefined Behavior - 4.5. Memory Aliasing - 4.6. Arithmetic Optimizations - 4.7. Code Layout - 4.8. Compile-Time Computation + 4.4. Contract Programming + 4.5. Non-Zero-Cost Abstractions + 4.6. Compile-Time Computation + 4.7. Arithmetic Optimizations + 4.8. What Compilers Can and Can't Do 5. Profiling 5.1. Instrumentation 5.2. Statistical Profiling 5.3. Program Simulation 5.4. Machine Code Analyzers - 5.5. Reducing Noise - 5.6. Benchmarking + 5.5. Benchmarking + 5.6. Getting Accurate Results 6. Arithmetic 6.1. Floating-Point Numbers 6.2. Interval Arithmetic @@ -197,32 +138,36 @@ I have something like this in mind: 7.6. Hashing 7.7. Random Number Generation 8. External Memory - 8.1. External Sorting - 8.2. List Ranking - 8.3. Eviction Policies - 8.4. Data Locality - 8.5. Cache Blocking - 8.6. Cache-Oblivious Algorithms -(8.7. B-Trees) -(8.8. Sublinear Algorithms) + 8.1. Memory Hierarchy + 8.2. Virtual Memory + 8.3. External Memory Model + 8.4. External Sorting + 8.5. List Ranking + 8.6. Eviction Policies + 8.7. Cache-Oblivious Algorithms + 8.8. Spacial and Temporal Locality +(8.9. B-Trees) +(8.10. Sublinear Algorithms) +(9.13. Memory Management) 9. RAM & CPU Caches 9.1. Memory Bandwidth - 9.2. Cache Lines and Memory Alignment - 9.3. Bit Fields and Packing - 9.4. Memory Paging - 9.5. Cache Associativity - 9.6. Memory Latency - 9.7. Memory-Level Parallelism - 9.8. Prefetching - 9.9. Pointers and Their Alternatives -(9.10. Memory Management) -(9.11. memcpy and memset) + 9.2. Memory Latency + 9.3. Cache Lines + 9.4. Memory Sharing + 9.5. Memory-Level Parallelism + 9.6. Prefetching + 9.7. Alignment and Packing + 9.8. Pointer Alternatives + 9.9. Cache Associativity + 9.10. Memory Paging + 9.11. AoS and SoA 10. SIMD Parallelism - 10.1. Using SIMD in C/C++ - 10.2. Reductions - 10.3. Auto-Vectorization - 10.4. Data Twiddling - 10.5. SSE & AVX Cookbook + 10.1. Intrinsics and Vector Types + 10.2. Moving Data + 10.3. Reductions + 10.4. Masking and Blending + 10.5. In-Register Shuffles + 10.6. Auto-Vectorization and SPMD 11. Algorithm Case Studies 11.1. Binary GCD (11.2. Prime Number Sieves) @@ -232,19 +177,113 @@ I have something like this in mind: 11.6. Fast Fourier Transform 11.7. Number-Theoretic Transform 11.8. Argmin with SIMD - 11.9. Reading and Writing Integers -(11.10. Reading and Writing Floats) -(11.11. String Searching) - 11.12. Sorting - 11.13. Matrix Multiplication + 11.9. Prefix Sum with SIMD + 11.10. Reading Decimal Integers + 11.11. Writing Decimal Integers +(11.12. Reading and Writing Floats) +(11.13. String Searching) + 11.14. Sorting + 11.15. Matrix Multiplication 12. Data Structure Case Studies 12.1. 
Binary Search - 12.2. Dynamic Prefix Sum -(12.3. Ordered Trees) -(12.4. Range Minimum Query) - 12.5. Hash Tables -(12.6. Bitmaps) -(12.7. Probabilistic Filters) + 12.2. Static B-Trees +(12.3. Search Trees) + 12.4. Segment Trees +(12.5. Tries) +(12.6. Range Minimum Query) + 12.7. Hash Tables +(12.8. Bitmaps) +(12.9. Probabilistic Filters) ``` -I will probably start refactoring once I'm done with the original plan, but it may start morphing before that. +Among the cool things that we will speed up: + +- 2x faster GCD (compared to `std::gcd`) +- 8-15x faster binary search (compared to `std::lower_bound`) +- 5-10x faster segment trees (compared to Fenwick trees) +- 5x faster hash tables (compared to `std::unordered_map`) +- 2x faster popcount (compared to repeatedly calling `popcnt`) +- 35x faster parsing series of integers (compared to `scanf`) +- ?x faster sorting (compared to `std::sort`) +- 2x faster sum (compared to `std::accumulate`) +- 2-3x faster prefix sum (compared to naive implementation) +- 10x faster argmin (compared to naive implementation) +- 10x faster array searching (compared to `std::find`) +- 15x faster search tree (compared to `std::set`) +- 100x faster matrix multiplication (compared to "for-for-for") +- optimal word-size integer factorization (~0.4ms per 60-bit integer) +- optimal Karatsuba Algorithm +- optimal FFT + +Volume: 450-600 pages +Release date: Q3 2022 + +### Part II: Parallel Algorithms + +Concurrency, models of parallelism, context switching, green threads, concurrent runtimes, cache coherence, synchronization primitives, OpenMP, reductions, scans, list ranking, graph algorithms, lock-free data structures, heterogeneous computing, CUDA, kernels, warps, blocks, matrix multiplication, sorting. + +Volume: 150-200 pages +Release date: 2023-2024? + +### Part III: Distributed Computing + + + +Metworking, message passing, actor model, communication-constrained algorithms, distributed primitives, all-reduce, MapReduce, stream processing, query planning, storage, sharding, compression, distributed databases, consistency, reliability, scheduling, workflow engines, cloud computing. + +Release date: ??? (more likely to be completed than not) + +### Part IV: Software & Hardware + + + +LLVM IR, compiler optimizations & back-end, interpreters, JIT-compilation, Cython, JAX, Numba, Julia, OpenCL, DPC++, oneAPI, XLA, (basic) Verilog, FPGAs, ASICs, TPUs and other AI accelerators. + +Release date: ??? 
(less likely to be completed than not) + +### Acknowledgements + +The book is largely based on blog posts, research papers, conference talks, and other work authored by a lot of people: + +- [Agner Fog](https://agner.org/optimize/) +- [Daniel Lemire](https://lemire.me/en/#publications) +- [Andrei Alexandrescu](https://erdani.com/index.php/about/) +- [Chandler Carruth](https://twitter.com/chandlerc1024) +- [Wojciech Muła](http://0x80.pl/articles/index.html) +- [Malte Skarupke](https://probablydance.com/) +- [Travis Downs](https://travisdowns.github.io/) +- [Brendan Gregg](https://www.brendangregg.com/blog/index.html) +- [Andreas Abel](http://embedded.cs.uni-saarland.de/abel.php) +- [Jakob Kogler](https://cp-algorithms.com/) +- [Igor Ostrovsky](http://igoro.com/) +- [Steven Pigeon](https://hbfs.wordpress.com/) +- [Denis Bakhvalov](https://easyperf.net/notes/) +- [Paul Khuong](https://pvk.ca/) +- [Pat Morin](https://cglab.ca/~morin/) +- [Victor Eijkhout](https://www.tacc.utexas.edu/about/directory/victor-eijkhout) +- [Robert van de Geijn](https://www.cs.utexas.edu/~rvdg/) +- [Edmond Chow](https://www.cc.gatech.edu/~echow/) +- [Peter Cordes](https://stackoverflow.com/users/224132/peter-cordes) +- [Geoff Langdale](https://branchfree.org/) +- [Matt Kulukundis](https://twitter.com/JuvHarlequinKFM) +- [Georg Sauthoff](https://gms.tf/) +- [Danila Kutenin](https://danlark.org/author/kutdanila/) +- [Ivica Bogosavljević](https://johnysswlab.com/author/ibogi/) +- [Matt Pharr](https://pharr.org/matt/) +- [Jan Wassenberg](https://research.google/people/JanWassenberg/) +- [Marshall Lochbaum](https://mlochbaum.github.io/publications.html) +- [Pavel Zemtsov](https://pzemtsov.github.io/) +- [Gustavo Duarte](https://manybutfinite.com/) +- [Nyaan](https://nyaannyaan.github.io/library/) +- [Nayuki](https://www.nayuki.io/category/programming) +- [Konstantin](http://const.me/) +- [InstLatX64](https://twitter.com/InstLatX64) +- [ridiculous_fish](https://ridiculousfish.com/blog/) +- [Z boson](https://stackoverflow.com/users/2542702/z-boson) +- [Creel](https://www.youtube.com/c/WhatsACreel) + +### Disclaimer: Technology Choices + +The examples in this book use C++, GCC, x86-64, CUDA, and Spark, although the underlying principles conveyed are not specific to them. + +To clear my conscience, I'm not happy with any of these choices: these technologies just happen to be the most widespread and stable at the moment and thus more helpful to the reader. I would have respectively picked C / Rust / [Carbon?](https://github.com/carbon-language/carbon-lang), LLVM, arm, OpenCL, and Dask; maybe there will be a 2nd edition in which some of the tech stack is changed. diff --git a/content/english/hpc/algorithms/argmin.md b/content/english/hpc/algorithms/argmin.md index 1ff6b855..2089d083 100644 --- a/content/english/hpc/algorithms/argmin.md +++ b/content/english/hpc/algorithms/argmin.md @@ -1,7 +1,340 @@ --- title: Argmin with SIMD weight: 7 -draft: true --- -... +Computing the *minimum* of an array is [easily vectorizable](/hpc/simd/reduction), as it is not different from any other reduction: in AVX2, you just need to use a convenient `_mm256_min_epi32` intrinsic as the inner operation. It computes the minimum of two 8-element vectors in one cycle — even faster than in the scalar case, which requires at least a comparison and a conditional move. + +Finding the *index* of that minimum element (*argmin*) is much harder, but it is still possible to vectorize very efficiently. 
In this section, we design an algorithm that computes the argmin (almost) at the speed of computing the minimum and ~15x faster than the naive scalar approach. + +### Scalar Baseline + +For our benchmark, we create an array of random 32-bit integers, and then repeatedly try to find the index of the minimum among them (the first one if it isn't unique): + +```c++ +const int N = (1 << 16); +alignas(32) int a[N]; + +for (int i = 0; i < N; i++) + a[i] = rand(); +``` + +For the sake of exposition, we assume that $N$ is a power of two, and run all our experiments for $N=2^{13}$ so that the [memory bandwidth](/hpc/cpu-cache/bandwidth) is not a concern. + +To implement argmin in the scalar case, we just need to maintain the index instead of the minimum value: + +```c++ +int argmin(int *a, int n) { + int k = 0; + + for (int i = 0; i < n; i++) + if (a[i] < a[k]) + k = i; + + return k; +} +``` + +It works at around 1.5 GFLOPS — meaning $1.5 \cdot 10^9$ values per second processed on average, or about 0.75 values per cycle (the CPU is clocked at 2GHz). + +Let's compare it to `std::min_element`: + +```c++ +int argmin(int *a, int n) { + int k = std::min_element(a, a + n) - a; + return k; +} +``` + + + +The version from GCC gives ~0.28 GFLOPS — apparently, the compiler couldn't pierce through all the abstractions. Another reminder to never use STL. + +### Vector of Indices + +The problem with vectorizing the scalar implementation is that there is a dependency between consequent iterations. When we optimized [array sum](/hpc/simd/reduction), we faced the same problem, and we solved it by splitting the array into 8 slices, each representing a subset of its indices with the same remainder modulo 8. We can apply the same trick here, except that we also have to take array indices into account. + +When we have the consecutive elements and their indices in vectors, we can process them in parallel using [predication](/hpc/pipelining/branchless): + +```c++ +typedef __m256i reg; + +int argmin(int *a, int n) { + // indices on the current iteration + reg cur = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + // the current minimum for each slice + reg min = _mm256_set1_epi32(INT_MAX); + // its index (argmin) for each slice + reg idx = _mm256_setzero_si256(); + + for (int i = 0; i < n; i += 8) { + // load a new SIMD block + reg x = _mm256_load_si256((reg*) &a[i]); + // find the slices where the minimum is updated + reg mask = _mm256_cmpgt_epi32(min, x); + // update the indices + idx = _mm256_blendv_epi8(idx, cur, mask); + // update the minimum (can also similarly use a "blend" here, but min is faster) + min = _mm256_min_epi32(x, min); + // update the current indices + const reg eight = _mm256_set1_epi32(8); + cur = _mm256_add_epi32(cur, eight); // + // can also use a "blend" here, but min is faster + } + + // find the argmin in the "min" register and return its real index + + int min_arr[8], idx_arr[8]; + + _mm256_storeu_si256((reg*) min_arr, min); + _mm256_storeu_si256((reg*) idx_arr, idx); + + int k = 0, m = min_arr[0]; + + for (int i = 1; i < 8; i++) + if (min_arr[i] < m) + m = min_arr[k = i]; + + return idx_arr[k]; +} +``` + +It works at around 8-8.5 GFLOPS. There is still some inter-dependency between the iterations, so we can optimize it by considering more than 8 elements per iteration and taking advantage of the [instruction-level parallelism](/hpc/simd/reduction#instruction-level-parallelism). 
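+
+As a rough illustration of that direction — this is an editorial sketch, not the benchmarked implementation; it assumes the `reg` typedef from above, AVX2, and an array length divisible by 16 — we can maintain two independent (minimum, index) accumulator pairs so that their updates can overlap across iterations:
+
+```c++
+int argmin_unrolled(int *a, int n) { // hypothetical name, not used elsewhere in the book
+    reg cur1 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+    reg cur2 = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
+    reg min1 = _mm256_set1_epi32(INT_MAX), min2 = _mm256_set1_epi32(INT_MAX);
+    reg idx1 = _mm256_setzero_si256(), idx2 = _mm256_setzero_si256();
+    const reg sixteen = _mm256_set1_epi32(16);
+
+    for (int i = 0; i < n; i += 16) {
+        reg x1 = _mm256_load_si256((reg*) &a[i]);
+        reg x2 = _mm256_load_si256((reg*) &a[i + 8]);
+        // the two accumulator pairs do not depend on each other,
+        // so the CPU can execute their updates in parallel
+        reg mask1 = _mm256_cmpgt_epi32(min1, x1);
+        reg mask2 = _mm256_cmpgt_epi32(min2, x2);
+        idx1 = _mm256_blendv_epi8(idx1, cur1, mask1);
+        idx2 = _mm256_blendv_epi8(idx2, cur2, mask2);
+        min1 = _mm256_min_epi32(x1, min1);
+        min2 = _mm256_min_epi32(x2, min2);
+        cur1 = _mm256_add_epi32(cur1, sixteen);
+        cur2 = _mm256_add_epi32(cur2, sixteen);
+    }
+
+    // combine the two pairs (ties go to the first pair, which holds the smaller indices)
+    // and then reduce the 8-element vectors as before
+    reg swap = _mm256_cmpgt_epi32(min1, min2);
+    reg idx = _mm256_blendv_epi8(idx1, idx2, swap);
+    reg min = _mm256_min_epi32(min1, min2);
+
+    int min_arr[8], idx_arr[8];
+    _mm256_storeu_si256((reg*) min_arr, min);
+    _mm256_storeu_si256((reg*) idx_arr, idx);
+
+    int k = 0;
+    for (int i = 1; i < 8; i++)
+        if (min_arr[i] < min_arr[k])
+            k = i;
+
+    return idx_arr[k];
+}
+```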
+ +This would help performance a lot, but not enough to match the speed of computing the minimum (~24 GFLOPS) because there is another bottleneck. On each iteration, we need a load-fused comparison, a load-fused minimum, a blend, and an addition — that is 4 instructions in total to process 8 elements. Since the decode width of this CPU (Zen 2) is just 4, the performance will still be limited by 8 × 2 = 16 GFLOPS even if we somehow got rid of all the other bottlenecks. + +Instead, we will switch to another approach that requires fewer instructions per element. + +### Branches Aren't Scary + +When we run the scalar version, how often do we update the minimum? + +Intuition tells us that, if all the values are drawn independently at random, then the event when the next element is less than all the previous ones shouldn't be frequent. More precisely, it equals the reciprocal of the number of processed elements. Therefore, the expected number of times the `a[i] < a[k]` condition is satisfied equals the sum of the harmonic series: + +$$ +\frac{1}{2} + \frac{1}{3} + \frac{1}{4} + \ldots + \frac{1}{n} = O(\ln(n)) +$$ + +So the minimum is updated around 5 times for a hundred-element array, 7 for a thousand-element, and just 14 for a million-element array — which isn't large at all when looked at as a fraction of all is-new-minimum checks. + +The compiler probably couldn't figure it out on its own, so let's [explicitly provide](/hpc/compilation/situational) this information: + +```c++ +int argmin(int *a, int n) { + int k = 0; + + for (int i = 0; i < n; i++) + if (a[i] < a[k]) [[unlikely]] + k = i; + + return k; +} +``` + +The compiler [optimized the machine code layout](/hpc/architecture/layout), and the CPU is now able to execute the loop at around 2 GFLOPS — a slight but sizeable improvement from 1.5 GFLOPS of the non-hinted loop. + +Here is the idea: if we are only updating the minimum a dozen or so times during the entire computation, we can ditch all the vector-blending and index updating and just maintain the minimum and regularly check if it has changed. Inside this check, we can use however slow method of updating the argmin we want because it will only be called a few times. + +To implement it with SIMD, all we need to do on each iteration is a vector load, a comparison, and a test-if-zero: + +```c++ +int argmin(int *a, int n) { + int min = INT_MAX, idx = 0; + + reg p = _mm256_set1_epi32(min); + + for (int i = 0; i < n; i += 8) { + reg y = _mm256_load_si256((reg*) &a[i]); + reg mask = _mm256_cmpgt_epi32(p, y); + if (!_mm256_testz_si256(mask, mask)) { [[unlikely]] + for (int j = i; j < i + 8; j++) + if (a[j] < min) + min = a[idx = j]; + p = _mm256_set1_epi32(min); + } + } + + return idx; +} +``` + +It already performs at ~8.5 GFLOPS, but now the loop is bottlenecked by the `testz` instruction which only has a throughput of one. 
The solution is to load two consecutive SIMD blocks and use the minimum of them so that the `testz` effectively processes 16 elements in one go: + +```c++ +int argmin(int *a, int n) { + int min = INT_MAX, idx = 0; + + reg p = _mm256_set1_epi32(min); + + for (int i = 0; i < n; i += 16) { + reg y1 = _mm256_load_si256((reg*) &a[i]); + reg y2 = _mm256_load_si256((reg*) &a[i + 8]); + reg y = _mm256_min_epi32(y1, y2); + reg mask = _mm256_cmpgt_epi32(p, y); + if (!_mm256_testz_si256(mask, mask)) { [[unlikely]] + for (int j = i; j < i + 16; j++) + if (a[j] < min) + min = a[idx = j]; + p = _mm256_set1_epi32(min); + } + } + + return idx; +} +``` + +This version works in ~10 GFLOPS. To remove the other obstacles, we can do two things: + +- Increase the block size to 32 elements to allow for more instruction-level parallelism. +- Optimize the local argmin: instead of calculating its exact location, we can just save the index of the block and then come back at the end and find it just once. This lets us only compute the minimum on each positive check and broadcast it to a vector, which is simpler and much faster. + +With these two optimizations implemented, the performance increases to a whopping ~22 GFLOPS: + +```c++ +int argmin(int *a, int n) { + int min = INT_MAX, idx = 0; + + reg p = _mm256_set1_epi32(min); + + for (int i = 0; i < n; i += 32) { + reg y1 = _mm256_load_si256((reg*) &a[i]); + reg y2 = _mm256_load_si256((reg*) &a[i + 8]); + reg y3 = _mm256_load_si256((reg*) &a[i + 16]); + reg y4 = _mm256_load_si256((reg*) &a[i + 24]); + y1 = _mm256_min_epi32(y1, y2); + y3 = _mm256_min_epi32(y3, y4); + y1 = _mm256_min_epi32(y1, y3); + reg mask = _mm256_cmpgt_epi32(p, y1); + if (!_mm256_testz_si256(mask, mask)) { [[unlikely]] + idx = i; + for (int j = i; j < i + 32; j++) + min = (a[j] < min ? a[j] : min); + p = _mm256_set1_epi32(min); + } + } + + for (int i = idx; i < idx + 31; i++) + if (a[i] == min) + return i; + + return idx + 31; +} +``` + +This is almost as high as it can get as just computing the minimum itself works at around 24-25 GFLOPS. + +The only problem of all these branch-happy SIMD implementations is that they rely on the minimum being updated very infrequently. This is true for random input distributions, but not in the worst case. If we fill the array with a sequence of decreasing numbers, the performance of the last implementation drops to about 2.7 GFLOPS — almost 10 times as slow (although still faster than the scalar code because we only calculate the minimum on each block). + +One way to fix this is to do the same thing that the quicksort-like randomized algorithms do: just shuffle the input yourself and iterate over the array in random order. This lets you avoid this worst-case penalty, but it is tricky to implement due to RNG- and [memory](/hpc/cpu-cache/prefetching)-related issues. There is a simpler solution. + +### Find the Minimum, Then Find the Index + +We know how to [calculate the minimum of an array](/hpc/simd/reduction) fast and how to [find an element in an array](/hpc/simd/masking#searching) fast — so why don't we just separately compute the minimum and then find it? + +```c++ +int argmin(int *a, int n) { + int needle = min(a, n); + int idx = find(a, n, needle); + return idx; +} +``` + +If we implement the two subroutines optimally (check the linked articles), the performance will be ~18 GFLOPS for random arrays and ~12 GFLOPS for decreasing arrays — which makes sense as we are expected to read the array 1.5 and 2 times respectively. 
This isn't that bad by itself — at least we avoid the 10x worst-case performance penalty — but the problem is that this penalized performance also translates to larger arrays, when we are bottlenecked by the [memory bandwidth](/hpc/cpu-cache/bandwidth) rather than compute. + +Luckily, we already know how to fix it. We can split the array into blocks of fixed size $B$ and compute the minima on these blocks while also maintaining the global minimum. When the minimum on a new block is lower than the global minimum, we update it and also remember the block number of where the global minimum currently is. After we've processed the entire array, we just return to that block and scan through its $B$ elements to find the argmin. + +This way we only process $(N + B)$ elements and don't have to sacrifice neither ½ nor ⅓ of the performance: + +```c++ +const int B = 256; + +// returns the minimum and its first block +pair approx_argmin(int *a, int n) { + int res = INT_MAX, idx = 0; + for (int i = 0; i < n; i += B) { + int val = min(a + i, B); + if (val < res) { + res = val; + idx = i; + } + } + return {res, idx}; +} + +int argmin(int *a, int n) { + auto [needle, base] = approx_argmin(a, n); + int idx = find(a + base, B, needle); + return base + idx; +} +``` + +This results for the final implementation are ~22 and ~19 GFLOPS for random and decreasing arrays respectively. + +The full implementation, including both `min()` and `find()`, is about 100 lines long. [Take a look](https://github.com/sslotin/amh-code/blob/main/argmin/combined.cc) if you want, although it is still far from being production-grade. + +### Summary + +Here are the results combined for all implementations: + +``` +algorithm rand decr reason for the performance difference +----------- ----- ----- ------------------------------------------------------------- +std 0.28 0.28 +scalar 1.54 1.89 efficient branch prediction ++ hinted 1.95 0.75 wrong hint +index 8.17 8.12 +simd 8.51 1.65 scalar-based argmin on each iteration ++ ilp 10.22 1.74 ^ same ++ optimized 22.44 2.70 ^ same, but faster because there are less inter-dependencies +min+find 18.21 12.92 find() has to scan the entire array ++ blocked 22.23 19.29 we still have an optional horizontal minimum every B elements +``` + +Take these results with a grain of salt: the measurements are [quite noisy](/hpc/profiling/noise), they were done for just for two input distributions, for a specific array size ($N=2^{13}$, the size of the L1 cache), for a specific architecture (Zen 2), and for a specific and slightly outdated compiler (GCC 9.3) — the compiler optimizations were also very fragile to little changes in the benchmarking code. + +There are also still some minor things to optimize, but the potential improvement is less than 10% so I didn't bother. One day I may pluck up the courage, optimize the algorithm to the theoretical limit, handle the non-divisible-by-block-size array sizes and non-aligned memory cases, and then re-run the benchmarks properly on many architectures, with p-values and such. In case someone does it before me, please [ping me back](http://sereja.me/). + +### Acknowledgements + +The first, index-based SIMD algorithm was [originally designed](http://0x80.pl/notesen/2018-10-03-simd-index-of-min.html) by Wojciech Muła in 2018. 
+ +Thanks to Zach Wegner for [pointing out](https://twitter.com/zwegner/status/1491520929138151425) that the performance of the Muła's algorithm is improved when implemented manually using intrinsics (I originally used the [GCC vector types](/hpc/simd/intrinsics/#gcc-vector-extensions)). + + + +After publication, I've discovered that [Marshall Lochbaum](https://www.aplwiki.com/wiki/Marshall_Lochbaum), the creator of [BQN](https://mlochbaum.github.io/BQN/), designed a [very similar algorithm](https://forums.dyalog.com/viewtopic.php?f=13&t=1579&sid=e2cbd69817a17a6e7b1f76c677b1f69e#p6239) while he was working on Dyalog APL in 2019. Pay more attention to the world of array programming languages! diff --git a/content/english/hpc/algorithms/factorization.md b/content/english/hpc/algorithms/factorization.md index 4ff8061d..b900eb8c 100644 --- a/content/english/hpc/algorithms/factorization.md +++ b/content/english/hpc/algorithms/factorization.md @@ -1,48 +1,74 @@ --- title: Integer Factorization weight: 3 -draft: true +published: true --- -Integer factorization is interesting because of RSA problem. +The problem of factoring integers into primes is central to computational [number theory](/hpc/number-theory/). It has been [studied](https://www.cs.purdue.edu/homes/ssw/chapter3.pdf) since at least the 3rd century BC, and [many methods](https://en.wikipedia.org/wiki/Category:Integer_factorization_algorithms) have been developed that are efficient for different inputs. -"How big are your numbers?" determines the method to use: +In this case study, we specifically consider the factorization of *word-sized* integers: those on the order of $10^9$ and $10^{18}$. Untypical for this book, in this one, you may actually learn an asymptotically better algorithm: we start with a few basic approaches and gradually build up to the $O(\sqrt[4]{n})$-time *Pollard's rho algorithm* and optimize it to the point where it can factorize 60-bit semiprimes in 0.3-0.4ms and ~3 times faster than the previous state-of-the-art. -- Less than 2^16 or so: Lookup table. -- Less than 2^70 or so: Richard Brent's modification of Pollard's rho algorithm. -- Less than 10^50: Lenstra elliptic curve factorization -- Less than 10^100: Quadratic Sieve -- More than 10^100: General Number Field Sieve + +### Benchmark -and do other computations such as computing the greatest common multiple (given that it is not even so that ) (since $\gcd(n, r) = 1$) - -For all methods, we will implement `find_factor` function which returns one divisor ot 1. You can apply it recurively to get the factorization, so whatever asymptotic you had won't affect it: +For all methods, we will implement `find_factor` function that takes a positive integer $n$ and returns any of its non-trivial divisors (or `1` if the number is prime): ```c++ -typedef uint32_t u32; -typedef uint64_t u64; +// I don't feel like typing "unsigned long long" each time +typedef __uint16_t u16; +typedef __uint32_t u32; +typedef __uint64_t u64; typedef __uint128_t u128; +u64 find_factor(u64 n); +``` + +To find the full factorization, you can apply it to $n$, reduce it, and continue until a new factor can no longer be found: + +```c++ vector factorize(u64 n) { - vector res; - while (int d = find_factor(n); d > 1) // does it work? 
-        res.push_back(d);
-    return res;
+    vector<u64> factorization;
+    u64 d;
+    while ((d = find_factor(n)) > 1) {
+        factorization.push_back(d);
+        n /= d;
+    }
+    if (n > 1)
+        factorization.push_back(n); // what remains at this point is prime
+    return factorization;
 }
 ```
-## Trial division
+After each removed factor, the problem becomes considerably smaller, so the worst-case running time of full factorization is equal to the worst-case running time of a `find_factor` call.
+
+For many factorization algorithms, including those presented in this section, the running time scales with the smaller prime factor. Therefore, to provide worst-case input, we use *semiprimes:* products of two prime numbers $p \le q$ that are on the same order of magnitude. We generate a $k$-bit semiprime as the product of two random $\lfloor k / 2 \rfloor$-bit primes.
+
+Since some of the algorithms are inherently randomized, we also tolerate a small (<1%) percentage of false-negative errors (when `find_factor` returns `1` despite the number $n$ being composite), although this rate can be reduced to almost zero without significant performance penalties.
+
+### Trial division
+
+The most basic approach is to try every integer smaller than $n$ as a divisor:
+
+```c++
+u64 find_factor(u64 n) {
+    for (u64 d = 2; d < n; d++)
+        if (n % d == 0)
+            return d;
+    return 1;
+}
+```
-The smallest divisor has to be a prime number.
-We remove the factor from the number, and repeat the process.
-If we cannot find any divisor in the range $[2; \sqrt{n}]$, then the number itself has to be prime.
+We can notice that if $n$ is divided by $d < \sqrt n$, then it is also divided by $\frac{n}{d} > \sqrt n$, and there is no need to check for it separately. This lets us stop trial division early and only check for potential divisors that do not exceed $\sqrt n$:
 ```c++
 u64 find_factor(u64 n) {
@@ -53,13 +79,43 @@ u64 find_factor(u64 n) {
 }
 ```
+In our benchmark, $n$ is a semiprime, and we always find the lesser divisor, so both the $O(n)$ and $O(\sqrt n)$ implementations perform the same and are able to factorize ~2k 30-bit numbers per second — while taking a whole 20 seconds to factorize a single 60-bit number.
+
+### Lookup Table
+
+Nowadays, you can type `factor 57` in your Linux terminal or Google search bar to get the factorization of any number. But before computers were invented, it was more practical to use *factorization tables:* special books containing the factorizations of the first $N$ numbers.
+
+We can also use this approach to compute these lookup tables [during compile time](/hpc/compilation/precalc/). To save space, we can store only the smallest divisor of each number. Since the smallest divisor does not exceed $\sqrt n$, we need just one byte per 16-bit integer:
+
+```c++
+template <int N = (1 << 16)>
+struct Precalc {
+    unsigned char divisor[N];
+
+    constexpr Precalc() : divisor{} {
+        for (int i = 0; i < N; i++)
+            divisor[i] = 1;
+        for (int i = 2; i * i < N; i++)
+            if (divisor[i] == 1)
+                for (int k = i * i; k < N; k += i)
+                    divisor[k] = i;
+    }
+};
+
+constexpr Precalc P{};
+
+u64 find_factor(u64 n) {
+    return P.divisor[n];
+}
+```
+
+With this approach, we can process 3M 16-bit integers per second, although it would probably [get slower](/hpc/cpu-cache/bandwidth/) for larger numbers. While it requires just a few milliseconds and 64KB of memory to calculate and store the divisors of the first $2^{16}$ numbers, it does not scale well for larger inputs.
+
 ### Wheel factorization
-This is an optimization of the trial division.
-The idea is the following.
-Once we know that the number is not divisible by 2, we don't need to check every other even number. -This leaves us with only $50\%$ of the numbers to check. -After checking 2, we can simply start with 3 and skip every other number. +To save paper space, pre-computer era factorization tables typically excluded numbers divisible by $2$ and $5$, making the factorization table ½ × ⅘ = 0.4 of its original size. In the decimal numeral system, you can quickly determine whether a number is divisible by $2$ or $5$ (by looking at its last digit) and keep dividing the number $n$ by $2$ or $5$ while it is possible, eventually arriving at some entry in the factorization table. + +We can apply a similar trick to trial division by first checking if the number is divisible by $2$ and then only considering odd divisors: ```c++ u64 find_factor(u64 n) { @@ -72,24 +128,29 @@ u64 find_factor(u64 n) { } ``` -This method can be extended. -If the number is not divisible by 3, we can also ignore all other multiples of 3 in the future computations. -So we only need to check the numbers $5, 7, 11, 13, 17, 19, 23, \dots$. -We can observe a pattern of these remaining numbers. -We need to check all numbers with $d \bmod 6 = 1$ and $d \bmod 6 = 5$. -So this leaves us with only $33.3\%$ percent of the numbers to check. -We can implement this by checking the primes 2 and 3 first, and then start checking with 5 and alternatively skip 1 or 3 numbers. +With 50% fewer divisions to perform, this algorithm works twice as fast. + +This method can be extended: if the number is not divisible by $3$, we can also ignore all multiples of $3$, and the same goes for all other divisors. The problem is, as we increase the number of primes to exclude, it becomes less straightforward to iterate only over the numbers not divisible by them as they follow an irregular pattern — unless the number of primes is small. + +For example, if we consider $2$, $3$, and $5$, then, among the first $90$ numbers, we only need to check: + +```center +(1,) 7, 11, 13, 17, 19, 23, 29, +31, 37, 41, 43, 47, 49, 53, 59, +61, 67, 71, 73, 77, 79, 83, 89… +``` + +You can notice a pattern: the sequence repeats itself every $30$ numbers. This is not surprising since the remainder modulo $2 \times 3 \times 5 = 30$ is all we need to determine whether a number is divisible by $2$, $3$, or $5$. This means that we only need to check $8$ numbers with specific remainders out of every $30$, proportionally improving the performance: ```c++ u64 find_factor(u64 n) { for (u64 d : {2, 3, 5}) if (n % d == 0) return d; - u64 increments[] = {0, 4, 6, 10, 12, 16, 22, 24}; - u64 sum = 30; - for (u64 d = 7; d * d <= n; d += sum) { - for (u64 k = 0; k < 8; k++) { - u64 x = d + increments[k]; + u64 offsets[] = {0, 4, 6, 10, 12, 16, 22, 24}; + for (u64 d = 7; d * d <= n; d += 30) { + for (u64 offset : offsets) { + u64 x = d + offset; if (n % x == 0) return x; } @@ -98,98 +159,290 @@ u64 find_factor(u64 n) { } ``` -We can extend this even further. -Here is an implementation for the prime number 2, 3 and 5. -It's convenient to use an array to store how much we have to skip. +As expected, it works $\frac{30}{8} = 3.75$ times faster than the naive trial division, processing about 7.6k 30-bit numbers per second. The performance can be improved further by considering more primes, but the returns are diminishing: adding a new prime $p$ reduces the number of iterations by $\frac{1}{p}$ but increases the size of the skip-list by a factor of $p$, requiring proportionally more memory. 
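+
+For illustration, here is a small helper — a hypothetical sketch with an assumed name `make_wheel`, not part of the benchmarked code — that generates such a skip-list for any set of excluded primes. For $\{2, 3, 5\}$ it produces the $8$ residues corresponding to the offsets used above (the same sequence, shifted to start from $7$), while adding $7$ already requires $48$ offsets out of every $210$ numbers:
+
+```c++
+#include <vector>
+
+// returns the residues modulo the wheel size (the product of the given primes)
+// that are coprime with all of them — the only candidate divisors left to check
+std::vector<int> make_wheel(const std::vector<int> &primes) {
+    int w = 1;
+    for (int p : primes)
+        w *= p; // wheel size: 30 for {2, 3, 5}, 210 for {2, 3, 5, 7}
+
+    std::vector<int> residues;
+    for (int r = 1; r < w; r++) {
+        bool coprime = true;
+        for (int p : primes)
+            if (r % p == 0)
+                coprime = false;
+        if (coprime)
+            residues.push_back(r); // for {2, 3, 5}: 1, 7, 11, 13, 17, 19, 23, 29
+    }
+    return residues;
+}
+```
+
+The size of the returned list is $\phi(2 \cdot 3 \cdots p_k)$: each new prime $p$ removes only a $\frac{1}{p}$ fraction of the remaining candidates while multiplying the wheel size by $p$.
+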
-### Lookup table +### Precomputed Primes -We will choose to store smallest factors of first $2^16$ — because this way they all fit in just one byte, so we are sort of saving on memory here. +If we keep increasing the number of primes in wheel factorization, we eventually exclude all composite numbers and only check for prime factors. In this case, we don't need this array of offsets but just the array of primes: ```c++ -template -struct Precalc { - char divisor[N]; +const int N = (1 << 16); - constexpr Precalc() : divisor{} { - for (int i = 0; i < N; i++) - divisor[i] = 1; - for (int i = 2; i * i < N; i++) - if (divisor[i] == 1) - for (int k = i * i; k < N; k += i) - divisor[k] = i; +struct Precalc { + u16 primes[6542]; // # of primes under N=2^16 + + constexpr Precalc() : primes{} { + bool marked[N] = {}; + int n_primes = 0; + + for (int i = 2; i < N; i++) { + if (!marked[i]) { + primes[n_primes++] = i; + for (int j = 2 * i; j < N; j += i) + marked[j] = true; + } + } } }; -constexpr Precalc precalc{}; +constexpr Precalc P{}; u64 find_factor(u64 n) { - return precalc.divisor[n]; + for (u16 p : P.primes) + if (n % p == 0) + return p; + return 1; } ``` +This approach lets us process almost 20k 30-bit integers per second, but it does not work for larger (64-bit) numbers unless they have small ($< 2^{16}$) factors. + +Note that this is actually an asymptotic optimization: there are $O(\frac{n}{\ln n})$ primes among the first $n$ numbers, so this algorithm performs $O(\frac{\sqrt n}{\ln \sqrt n})$ operations, while wheel factorization only eliminates a large but constant fraction of divisors. If we extend it to 64-bit numbers and precompute every prime under $2^{32}$ (storing which would require several hundred megabytes of memory), the relative speedup would grow by a factor of $\frac{\ln \sqrt{n^2}}{\ln \sqrt n} = 2 \cdot \frac{1/2}{1/2} \cdot \frac{\ln n}{\ln n} = 2$. + +All variants of trial division, including this one, are bottlenecked by the speed of integer division, which can be [optimized](/hpc/arithmetic/division/) if we know the divisors in advance and allow for some additional precomputation. In our case, it is suitable to use [the Lemire division check](/hpc/arithmetic/division/#lemire-reduction): + +```c++ +// ...precomputation is the same as before, +// but we store the reciprocal instead of the prime number itself +u64 magic[6542]; +// for each prime i: +magic[n_primes++] = u64(-1) / i + 1; + +u64 find_factor(u64 n) { + for (u64 m : P.magic) + if (m * n < m) + return u64(-1) / m + 1; + return 1; +} +``` + +This makes the algorithm ~18x faster: we can now factorize **~350k** 30-bit numbers per second, which is actually the most efficient algorithm we have for this number range. While it can probably be optimized even further by performing these checks in parallel with [SIMD](/hpc/simd), we will stop there and try a different, asymptotically better approach. + ### Pollard's Rho Algorithm -The algorithm is probabilistic. This means that it may or may not work. You would also need to + + +Pollard's rho is a randomized $O(\sqrt[4]{n})$ integer factorization algorithm that makes use of the [birthday paradox](https://en.wikipedia.org/wiki/Birthday_problem): + +> One only needs to draw $d = \Theta(\sqrt{n})$ random numbers between $1$ and $n$ to get a collision with high probability. + +The reasoning behind it is that each of the $d$ added element has a $\frac{d}{n}$ chance of colliding with some other element, implying that the expected number of collisions is $\frac{d^2}{n}$. 
If $d$ is asymptotically smaller than $\sqrt n$, then this ratio grows to zero as $n \to \infty$, and to infinity otherwise. + +Consider some function $f(x)$ that takes a remainder $x \in [0, n)$ and maps it to some other remainder of $n$ in a way that seems random from the number theory point of view. Specifically, we will use $f(x) = x^2 + 1 \bmod n$, which is random enough for our purposes. + +Now, consider a graph where each number-vertex $x$ has an edge pointing to $f(x)$. Such graphs are called *functional*. In functional graphs, the "trajectory" of any element — the path we walk if we start from that element and keep following the edges — is a path that eventually loops around (because the set of vertices is limited, and at some point, we have to go to a vertex we have already visited). + +![The trajectory of an element resembles the greek letter ρ (rho), which is what the algorithm is named after](../img/rho.jpg) + +Consider a trajectory of some particular element $x_0$: + +$$ +x_0, \; f(x_0), \; f(f(x_0)), \; \ldots +$$ + +Let's make another sequence out of this one by reducing each element modulo $p$, the smallest prime divisor of $n$. -> В мультимножество нужно добавить $O(\sqrt{n})$ случайных чисел от 1 до $n$, чтобы какие-то два совпали. +**Lemma.** The expected length of the reduced sequence before it turns into a cycle is $O(\sqrt[4]{n})$. -## $\rho$-алгоритм Полларда +**Proof:** Since $p$ is the smallest divisor, $p \leq \sqrt n$. Each time we follow a new edge, we essentially generate a random number between $0$ and $p$ (we treat $f$ as a "deterministically-random" function). The birthday paradox states that we only need to generate $O(\sqrt p) = O(\sqrt[4]{n})$ numbers until we get a collision and thus enter a loop. -Итак, мы хотим факторизовать число $n$. Предположим, что $n = p q$ и $p \approx q$. Понятно, что труднее случая, наверное, нет. Алгоритм итеративно ищет наименьший делитель и таким образом сводит задачу к как минимум в два раза меньшей. +Since we don't know $p$, this mod-$p$ sequence is only imaginary, but if find a cycle in it — that is, $i$ and $j$ such that -Возьмём произвольную «достаточно случайную» с точки зрения теории чисел функцию. Например $f(x) = (x+1)^2 \mod n$. +$$ +f^i(x_0) \equiv f^j(x_0) \pmod p +$$ -Граф, в котором из каждой вершины есть единственное ребро $x \to f(x)$, называется *функциональным*. Если в нём нарисовать «траекторию» произвольного элемента — какой-то путь, превращающийся в цикл — то получится что-то похожее на букву $\rho$ (ро). Алгоритм из-за этого так и назван. +then we can also find $p$ itself as -![](https://upload.wikimedia.org/wikipedia/commons/4/47/Pollard_rho_cycle.jpg) +$$ +p = \gcd(|f^i(x_0) - f^j(x_0)|, n) +$$ -Рассмотрим траекторию какого-нибудь элемента $x_0$: {$x_0$, $f(x_0)$, $f(f(x_0))$, $\ldots$}. Сделаем из неё новую последовательность, мысленно взяв каждый элемент по модулю $p$ — наименьшего из простых делителей $n$. +The algorithm itself just finds this cycle and $p$ using this GCD trick and Floyd's "[tortoise and hare](https://en.wikipedia.org/wiki/Cycle_detection#Floyd's_tortoise_and_hare)" algorithm: we maintain two pointers $i$ and $j = 2i$ and check that -**Утверждение**. Ожидаемая длина цикла в этой последовательности $O(\sqrt[4]{n})$. +$$ +\gcd(|f^i(x_0) - f^j(x_0)|, n) \neq 1 +$$ -*Доказательство:* так как $p$ — меньший делитель, то $p \leq \sqrt n$. 
Теперь просто подставлим в следствие из парадокса дней рождений: в множество нужно добавить $O(\sqrt{p}) = O(\sqrt[4]{n})$ элементов, чтобы какие-то два совпали, а значит последовательность зациклилась.
+which is equivalent to comparing $f^i(x_0)$ and $f^j(x_0)$ modulo $p$. Since $j$ (hare) is increasing at twice the rate of $i$ (tortoise), their difference is increasing by $1$ each iteration and eventually will become equal to (or a multiple of) the cycle length, with $i$ and $j$ pointing to the same elements. And as we proved half a page ago, reaching a cycle would only require $O(\sqrt[4]{n})$ iterations:
-Если мы найдём цикл в такой последовательности — то есть такие $i$ и $j$, что $f^i(x_0) \equiv f^j(x_0) \pmod p$ — то мы сможем найти и какой-то делитель $n$, а именно $\gcd(|f^i(x_0) - f^j(x_0)|, n)$ — это число меньше $n$ и делится на $p$.
+```c++
+u64 f(u64 x, u64 mod) {
+    return ((u128) x * x + 1) % mod;
+}
+
+u64 diff(u64 a, u64 b) {
+    // a and b are unsigned and so is their difference, so we can't just call abs(a - b)
+    return a > b ? a - b : b - a;
+}
+
+const u64 SEED = 42;
+
+u64 find_factor(u64 n) {
+    u64 x = SEED, y = SEED, g = 1;
+    while (g == 1) {
+        x = f(f(x, n), n); // advance x twice
+        y = f(y, n);       // advance y once
+        g = gcd(diff(x, y), n);
+    }
+    return g;
+}
+```
+
+While it processes only ~25k 30-bit integers per second — which is almost 15 times slower than checking each prime using the fast division trick — it dramatically outperforms every $\tilde{O}(\sqrt n)$ algorithm for 60-bit numbers, factorizing around 90 of them per second.
+
+### Pollard-Brent Algorithm
-Алгоритм по сути находит цикл в этой последовательности, используя для этого стандартный алгоритм («черепаха и заяц»): будем поддерживать два удаляющихся друг от друга указателя $i$ и $j$ ($i = 2j$) и проверять, что $f^i(x_0) \equiv f^j(x_0) \pmod p$, что эквивалентно проверке $\gcd(|f^i(x_0) - f^j(x_0)|, n) \not \in \{ 1, n \}$.
+Floyd's cycle-finding algorithm has a problem in that it moves iterators more than necessary: at least half of the vertices are visited one additional time by the slower iterator.
+
+One way to solve it is to memorize the values $x_i$ that the faster iterator visits and, every two iterations, compute the GCD using the difference of $x_i$ and $x_{\lfloor i / 2 \rfloor}$. But it can also be done without extra memory using a different principle: the tortoise doesn't move on every iteration, but it gets reset to the value of the faster iterator when the iteration number becomes a power of two. This lets us save additional iterations while still using the same GCD trick to compare $x_i$ and $x_{2^{\lfloor \log_2 i \rfloor}}$ on each iteration:
```c++
-typedef long long ll;
-
-inline ll f(ll x) { return (x+1)*(x+1); }
-
-ll find_divisor(ll n, ll seed = 1) {
-    ll x = seed, y = seed;
-    ll divisor = 1;
-    while (divisor == 1 || divisor == n) {
-        // двигаем первый указатель на шаг
-        y = f(y) % n;
-        // а второй -- на два
-        x = f(f(x) % n) % n;
-        // пытаемся найти общий делитель
-        divisor = __gcd(abs(x-y), n);
+u64 find_factor(u64 n) {
+    u64 x = SEED;
+
+    for (int l = 256; l < (1 << 20); l *= 2) {
+        u64 y = x;
+        for (int i = 0; i < l; i++) {
+            x = f(x, n);
+            if (u64 g = gcd(diff(x, y), n); g != 1)
+                return g;
+        }
     }
-    return divisor;
+
+    return 1;
}
```
-Так как алгоритм рандомизированный, при полной реализации нужно учитывать разные детали.
Например, что иногда делитель не находится (нужно запускать несколько раз), или что при попытке факторизовать простое число он будет работать за $O(\sqrt n)$ (нужно добавить отсечение по времени).
+Note that we also set an upper limit on the number of iterations so that the algorithm finishes in a reasonable amount of time and returns `1` if $n$ turns out to be a prime.
+
+It actually does *not* improve performance and even makes the algorithm ~1.5x *slower*, which probably has something to do with the fact that $x$ is stale. It spends most of the time computing the GCD and not advancing the iterator — in fact, the time requirement of this algorithm is currently $O(\sqrt[4]{n} \log n)$ because of it.
+
+Instead of [optimizing the GCD itself](../gcd), we will optimize the number of its invocations. We can use the fact that if one of $a$ and $b$ contains factor $p$, then $a \cdot b \bmod n$ will also contain it, so instead of computing $\gcd(a, n)$ and $\gcd(b, n)$, we can compute $\gcd(a \cdot b \bmod n, n)$. This way, if we group the GCD calculations into batches of $M = O(\log n)$ iterations, we remove the $\log n$ factor from the asymptotic:
+
+```c++
+const int M = 1024;
+
+u64 find_factor(u64 n) {
+    u64 x = SEED;
+
+    for (int l = M; l < (1 << 20); l *= 2) {
+        u64 y = x, p = 1;
+        for (int i = 0; i < l; i += M) {
+            for (int j = 0; j < M; j++) {
+                x = f(x, n);
+                p = (u128) p * diff(x, y) % n;
+            }
+            if (u64 g = gcd(p, n); g != 1)
+                return g;
+        }
+    }
+
+    return 1;
+}
+```
+
+Now it performs 425 factorizations per second, bottlenecked by the speed of the modulo operation.
+
+### Optimizing the Modulo
+
+The final step is to apply [Montgomery multiplication](/hpc/number-theory/montgomery/). Since the modulo is constant, we can perform all computations — advancing the iterator, multiplication, and even computing the GCD — in the Montgomery space where reduction is cheap:
+
+```c++
+struct Montgomery {
+    u64 n, nr;
+
+    Montgomery(u64 n) : n(n) {
+        nr = 1;
+        for (int i = 0; i < 6; i++)
+            nr *= 2 - n * nr;
+    }
+
+    u64 reduce(u128 x) const {
+        u64 q = u64(x) * nr;
+        u64 m = ((u128) q * n) >> 64;
+        return (x >> 64) + n - m;
+    }
+
+    u64 multiply(u64 x, u64 y) {
+        return reduce((u128) x * y);
+    }
+};
+
+u64 f(u64 x, u64 a, Montgomery m) {
+    return m.multiply(x, x) + a;
+}
+
+const int M = 1024;
+
+u64 find_factor(u64 n, u64 x0 = 2, u64 a = 1) {
+    Montgomery m(n);
+    u64 x = x0;
+
+    for (int l = M; l < (1 << 20); l *= 2) {
+        u64 y = x, p = 1;
+        for (int i = 0; i < l; i += M) {
+            for (int j = 0; j < M; j++) {
+                x = f(x, a, m);
+                p = m.multiply(p, diff(x, y));
+            }
+            if (u64 g = gcd(p, n); g != 1)
+                return g;
+        }
+    }
+
+    return 1;
+}
+```
+
+This implementation can process around 3k 60-bit integers per second, which is ~3x faster than what [PARI](https://pari.math.u-bordeaux.fr/) / [SageMath's `factor`](https://doc.sagemath.org/html/en/reference/structure/sage/structure/factorization.html) / `cat semiprimes.txt | time factor` measures.
+
+### Further Improvements
+
+**Optimizations.** There is still a lot of potential for optimization in our implementation of Pollard's algorithm:
+
+- We could probably use a better cycle-finding algorithm, exploiting the fact that the graph is random.
For example, there is little chance that we enter the loop in within the first few iterations (the length of the cycle and the path we walk before entering it should be equal in expectation since before we loop around, we choose the vertex of the path we've walked independently), so we may just advance the iterator for some time before starting the trials with the GCD trick. +- Our current approach is bottlenecked by advancing the iterator (the latency of Montgomery multiplication is much higher than its reciprocal throughput), and while we are waiting for it to complete, we could perform more than just one trial using the previous values. +- If we run $p$ independent instances of the algorithm with different seeds in parallel and stop when one of them finds the answer, it would finish $\sqrt p$ times faster (the reasoning is similar to the Birthday paradox; try to prove it yourself). We don't have to use multiple cores for that: there is a lot of untapped [instruction-level parallelism](/hpc/pipelining/), so we could concurrently run two or three of the same operations on the same thread, or use [SIMD](/hpc/simd) instructions to perform 4 or 8 multiplications in parallel. -### Brent's Method +I would not be surprised to see another 3x improvement and throughput of ~10k/sec. If you [implement](https://github.com/sslotin/amh-code/tree/main/factor) some of these ideas, please [let me know](http://sereja.me/). -Another idea is to accumulate the product and instead of calculating GCD on each step to calculate it every log n steps. + -### Optimizing division +**Errors.** Another aspect that we need to handle in a practical implementation is possible errors. Our current implementation has a 0.7% error rate for 60-bit integers, and it grows higher if the numbers are lower. These errors come from three main sources: -The next step is to actually apply Montgomery Multiplication. +- A cycle simply not being found (the algorithm is inherently random, and there is no guarantee that it will be found). In this case, we need to perform a primality test and optionally start again. +- The `p` variable becoming zero (because both $p$ and $q$ can get into the product). It becomes increasingly more likely as we decrease size of the inputs or increase the constant `M`. In this case, we need to either restart the process or (better) roll back the last $M$ iterations and perform the trials one by one. +- Overflows in the Montgomery multiplication. Our current implementation is pretty loose with them, and if $n$ is large, we need to add more `x > mod ? x - mod : x` kind of statements to deal with overflows. -This is exactly the type of problem when we need specific knowledge, because we have 64-bit modulo by not-compile-constants, and compiler can't really do much to optimize it. +**Larger numbers.** These issues become less important if we exclude small numbers and numbers with small prime factors using the algorithms we've implemented before. In general, the optimal approach should depend on the size of the numbers: -... 
+- Smaller than $2^{16}$: use a lookup table; +- Smaller than $2^{32}$: use a list of precomputed primes with a fast divisibility check; +- Smaller than $2^{64}$ or so: use Pollard's rho algorithm with Montgomery multiplication; +- Smaller than $10^{50}$: switch to [Lenstra elliptic curve factorization](https://en.wikipedia.org/wiki/Lenstra_elliptic-curve_factorization); +- Smaller than $10^{100}$: switch to [Quadratic Sieve](https://en.wikipedia.org/wiki/Quadratic_sieve); +- Larger than $10^{100}$: switch to [General Number Field Sieve](https://en.wikipedia.org/wiki/General_number_field_sieve). -## Further optimizations + -Существуют также [субэкспоненциальные](https://ru.wikipedia.org/wiki/%D0%A4%D0%B0%D0%BA%D1%82%D0%BE%D1%80%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D1%8F_%D1%86%D0%B5%D0%BB%D1%8B%D1%85_%D1%87%D0%B8%D1%81%D0%B5%D0%BB#%D0%A1%D1%83%D0%B1%D1%8D%D0%BA%D1%81%D0%BF%D0%BE%D0%BD%D0%B5%D0%BD%D1%86%D0%B8%D0%B0%D0%BB%D1%8C%D0%BD%D1%8B%D0%B5_%D0%B0%D0%BB%D0%B3%D0%BE%D1%80%D0%B8%D1%82%D0%BC%D1%8B), но не полиномиальные алгоритмы факторизации. Человечество [умеет](https://en.wikipedia.org/wiki/Integer_factorization_records) факторизовывать числа порядка $2^{200}$. +The last three approaches are very different from what we've been doing and require much more advanced number theory, and they deserve an article (or a full-length university course) of their own. diff --git a/content/english/hpc/algorithms/gcd.md b/content/english/hpc/algorithms/gcd.md index 726a4073..6a4f8ca7 100644 --- a/content/english/hpc/algorithms/gcd.md +++ b/content/english/hpc/algorithms/gcd.md @@ -14,7 +14,7 @@ $$ \gcd(a, b) = \max_{g: \; g|a \, \land \, g | b} g $$ -You probably already know this algorithm from a CS textbook, but let me briefly remind it anyway. It is based on the following formula, assuming that $a > b$: +You probably already know this algorithm from a CS textbook, but I will summarize it here. It is based on the following formula, assuming that $a > b$: $$ \gcd(a, b) = \begin{cases} @@ -27,7 +27,7 @@ This is true, because if $g = \gcd(a, b)$ divides both $a$ and $b$, it should al The formula above is essentially the algorithm itself: you can simply apply it recursively, and since each time one of the arguments strictly decreases, it will eventually converge to the $b = 0$ case. -The textbook also probably mentioned that the worst possible input to Euclid's algorithm — the one that maximizes the total number of steps — are consecutive Fibonacci numbers, and since they grow exponentially, the running time of the algorithm is logarithmic in the worst case. This is also true for its *average* running time, if we define it as the expected number os steps for pairs of uniformly distributed integers. [The wikipedia article](https://en.wikipedia.org/wiki/Euclidean_algorithm) also has a cryptic derivation of a more precise $0.84 \cdot \ln n$ asymptotic estimate. +The textbook also probably mentioned that the worst possible input to Euclid's algorithm — the one that maximizes the total number of steps — are consecutive Fibonacci numbers, and since they grow exponentially, the running time of the algorithm is logarithmic in the worst case. This is also true for its *average* running time if we define it as the expected number of steps for pairs of uniformly distributed integers. [The Wikipedia article](https://en.wikipedia.org/wiki/Euclidean_algorithm) also has a cryptic derivation of a more precise $0.84 \cdot \ln n$ asymptotic estimate. 
![You can see bright blue lines at the proportions of the golden ratio](../img/euclid.svg) @@ -87,11 +87,7 @@ loop: jne loop ``` -If you run `perf` on it, you will see that it spends ~90% of the time on the `idiv` line. What's happening here? - -In short: division works very poorly on x86 and computers in general. Integer division is notoriously hard to implement in hardware. The circuitry takes a lot of space in the ALU, the computation has a lot of stages, and as a result `div` and its siblings routinely take 10-20 cycles to complete. - -Since nobody wants to duplicate all this mess for a separate modulo operation, the `div` instruction serves both purposes. To perform an integer division, you need to put the dividend *specifically* in the `eax` register and call `div` with the divisor as its sole operand. After this, the quotient will be stored in `eax` and the remainder will be stored in `edx`, with latency being slightly less on smaller data type sizes. +If you run [perf](/hpc/profiling/events) on it, you will see that it spends ~90% of the time on the `idiv` line. This isn't surprising: general [integer division](/hpc/arithmetic/division) works notoriously slow on all computers, including x86. But there is one kind of division that works well in hardware: division by a power of 2. @@ -99,18 +95,18 @@ But there is one kind of division that works well in hardware: division by a pow The *binary GCD algorithm* was discovered around the same time as Euclid's, but on the other end of the civilized world, in ancient China. In 1967, it was rediscovered by Josef Stein for use in computers that either don't have division instruction or have a very slow one — it wasn't uncommon for CPUs of that era to use hundreds or thousands of cycles for rare or complex operations. -Analagous to the Euclidean algorithm, it is based on a few similar observations: +Analogous to the Euclidean algorithm, it is based on a few similar observations: 1. $\gcd(0, b) = b$ and symmetrically $\gcd(a, 0) = a$; 2. $\gcd(2a, 2b) = 2 \cdot \gcd(a, b)$; 3. $\gcd(2a, b) = \gcd(a, b)$ if $b$ is odd and symmetrically $\gcd(a, b) = \gcd(a, 2b)$ if $a$ is odd; 4. $\gcd(a, b) = \gcd(|a − b|, \min(a, b))$, if $a$ and $b$ are both odd. -Likewise, the algorithm itself is just repeated application of these identities. +Likewise, the algorithm itself is just a repeated application of these identities. -Its running time is still logarithmic, which is even easier to show because in each of these identities one of the arguments is divided by 2 — except for the last case, in which the new first argument, an absolute difference of two odd numbers, will be even and thus will be divided by 2 on the next iteration. +Its running time is still logarithmic, which is even easier to show because in each of these identities one of the arguments is divided by 2 — except for the last case, in which the new first argument, the absolute difference of two odd numbers, is guaranteed to be even and thus will be divided by 2 on the next iteration. -What makes this algorithm especially interesting to us is that the only arithmetic operations it uses are binary shifts, comparisons and subtractions, all of which typically take just one cycle. +What makes this algorithm especially interesting to us is that the only arithmetic operations it uses are binary shifts, comparisons, and subtractions, all of which typically take just one cycle. ### Implementation @@ -137,9 +133,9 @@ int gcd(int a, int b) { } ``` -Let's run it, and… it sucks. 
The difference in speed compared to `std::gcd` is indeed 2x, but on the other side of equation. This is mainly because of all the branching needed to differentiate between the cases. Let's start optimizing. +Let's run it, and… it sucks. The difference in speed compared to `std::gcd` is indeed 2x, but on the other side of the equation. This is mainly because of all the branching needed to differentiate between the cases. Let's start optimizing. -First, let's replace all divisions by 2 with divisions by whichever highest power of 2 we can. We can do it efficiently with `__builtin_ctz`, the "count trailing zeros" instruction available on modern CPUs. Whenever we are supposed to divide by 2 in the original algorithm, we will call this function instead, which will give us the exact amount to right-shift the number by. Assuming that the we are dealing with large random numbers, this is expected to decrease the number of iterations by almost a factor 2, because $1 + \frac{1}{2} + \frac{1}{4} + \frac{1}{8} + \ldots \to 2$. +First, let's replace all divisions by 2 with divisions by whichever highest power of 2 we can. We can do it efficiently with `__builtin_ctz`, the "count trailing zeros" instruction available on modern CPUs. Whenever we are supposed to divide by 2 in the original algorithm, we will call this function instead, which will give us the exact number of bits to right-shift the number by. Assuming that the we are dealing with large random numbers, this is expected to decrease the number of iterations by almost a factor 2, because $1 + \frac{1}{2} + \frac{1}{4} + \frac{1}{8} + \ldots \to 2$. Second, we can notice that condition 2 can now only be true once — in the very beginning — because every other identity leaves at least one of the numbers odd. Therefore we can handle this case just once in the beginning and not consider it in the main loop. @@ -190,7 +186,7 @@ loop: Let's draw the dependency graph of this loop: -@@ + -Modern processors can execute many instructions in parallel, essentially meaning that the true "cost" of this computation is roughly the sum of latencies on its critical path: in this case it is the total latency of diff, abs, ctz and shift. +![](../img/gcd-dependency1.png) -We can decrease this latency using the fact that we can actually calculate `ctz` using just `diff = a - b`, because a negative number divisible by $2^k$ still has $k$ zeros at the end. This lets us not wait for `max(diff, -diff)` to be computed first, resulting in a shorter graph like this: +Modern processors can execute many instructions in parallel, essentially meaning that the true "cost" of this computation is roughly the sum of latencies on its critical path. In this case, it is the total latency of `diff`, `abs`, `ctz`, and `shift`. -@@ +We can decrease this latency using the fact that we can actually calculate `ctz` using just `diff = a - b`, because a [negative number](../hpc/arithmetic/integer/#signed-integers) divisible by $2^k$ still has $k$ zeros at the end of its binary representation. This lets us not wait for `max(diff, -diff)` to be computed first, resulting in a shorter graph like this: + + + +![](../img/gcd-dependency2.png) Hopefully you will be less confused when you think about how the final code will be executed: @@ -252,9 +252,9 @@ int gcd(int a, int b) { } ``` -It runs in 91ns — which is good enough to leave it there. +It runs in 91ns, which is good enough to leave it there. 
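For reference, here is what the optimized binary GCD described in these hunks looks like when assembled into one function. This is a reconstruction from the prose above (assuming `<algorithm>` and `<cstdlib>` are included), so the exact version in the repository may differ in minor details:

```c++
int gcd(int a, int b) {
    if (a == 0) return b;
    if (b == 0) return a;

    int az = __builtin_ctz(a);
    int bz = __builtin_ctz(b);
    int shift = std::min(az, bz); // the common power of two, factored out once
    b >>= bz;

    while (a != 0) {
        a >>= az;
        int diff = b - a;         // may be negative: ctz works on the difference directly
        az = __builtin_ctz(diff); // diff == 0 only on the last iteration, where az is never used
        b = std::min(a, b);
        a = std::abs(diff);
    }

    return b << shift;
}
```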
-If somebody wants to try to shove off a few more nanoseconds by re-writing assembly by hand or trying a lookup table to save a few last iterations, please [let me know](http://sereja.me/).
+If somebody wants to try to shave off a few more nanoseconds by rewriting the assembly by hand or trying a lookup table to save a few last iterations, please [let me know](http://sereja.me/).

### Acknowledgements

diff --git a/content/english/hpc/algorithms/img/column-major.jpg b/content/english/hpc/algorithms/img/column-major.jpg
new file mode 100644 index 00000000..675d0b85
Binary files /dev/null and b/content/english/hpc/algorithms/img/column-major.jpg differ
diff --git a/content/english/hpc/algorithms/img/gcd-dependency1.png b/content/english/hpc/algorithms/img/gcd-dependency1.png
new file mode 100644 index 00000000..4e58904c
Binary files /dev/null and b/content/english/hpc/algorithms/img/gcd-dependency1.png differ
diff --git a/content/english/hpc/algorithms/img/gcd-dependency2.png b/content/english/hpc/algorithms/img/gcd-dependency2.png
new file mode 100644 index 00000000..b045ada4
Binary files /dev/null and b/content/english/hpc/algorithms/img/gcd-dependency2.png differ
[New Matplotlib-generated SVG plots added under content/english/hpc/algorithms/img/ — path data omitted: mm-blas.svg, mm-blocked-barplot.svg, mm-blocked-plot.svg, mm-kernel-barplot.svg, mm-kernel-plot.svg, mm-noalloc.svg, mm-vectorized-barplot.svg, mm-vectorized-plot.svg, prefix-blocked.svg, prefix-interleaved-prefetch.svg, prefix-interleaved.svg, prefix-nontemporal.svg]
diff --git a/content/english/hpc/algorithms/img/prefix-outline.png b/content/english/hpc/algorithms/img/prefix-outline.png
new file mode 100644 index 00000000..66c0ba82
Binary files /dev/null and b/content/english/hpc/algorithms/img/prefix-outline.png differ
[New Matplotlib-generated SVG plots added under content/english/hpc/algorithms/img/ — path data omitted: prefix-prefetch.svg, prefix-scalar.svg, prefix-simd.svg]
diff --git a/content/english/hpc/algorithms/img/rho.jpg b/content/english/hpc/algorithms/img/rho.jpg
new file mode 100644 index 00000000..d7f01ad8
Binary files /dev/null and b/content/english/hpc/algorithms/img/rho.jpg differ
diff --git a/content/english/hpc/algorithms/matmul.md b/content/english/hpc/algorithms/matmul.md
index be5bd07d..cf976045 100644
--- a/content/english/hpc/algorithms/matmul.md
+++ b/content/english/hpc/algorithms/matmul.md
@@ -1,426 +1,485 @@
---
title: Matrix Multiplication
-weight: 4
-draft: true
+weight: 20
---
+
-## Case Study: Distance Product
+In this case study, we will design and implement several algorithms for matrix multiplication.
-(We are going to speedrun "[Programming Parallel Computers](http://ppc.cs.aalto.fi/ch2/)" course)
+We start with the naive "for-for-for" algorithm and incrementally improve it, eventually arriving at a version that is 50 times faster and matches the performance of BLAS libraries while being under 40 lines of C.
-Given a matrix $D$, we need to calculate its "min-plus matrix multiplication" defined as:
+All implementations are compiled with GCC 13 and run on a [Zen 2](https://en.wikichip.org/wiki/amd/microarchitectures/zen_2) CPU clocked at 2GHz.
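Throughput numbers in this article count the $n^3$ multiply-adds of the definition as "useful operations." As a rough illustration of how they can be measured, here is a minimal benchmarking harness sketch — it is not part of the repository, and the `matmul` prototype stands in for any of the implementations below:

```c++
#include <chrono>
#include <cstdio>
#include <cstdlib>

// any of the matmul versions discussed in this article
void matmul(const float *a, const float *b, float *c, int n);

int main() {
    const int n = 1920;

    float *a = new float[n * n];
    float *b = new float[n * n];
    float *c = new float[n * n](); // zero-initialized

    for (int i = 0; i < n * n; i++) {
        a[i] = float(rand()) / RAND_MAX;
        b[i] = float(rand()) / RAND_MAX;
    }

    auto start = std::chrono::steady_clock::now();
    matmul(a, b, c, n);
    auto end = std::chrono::steady_clock::now();

    double seconds = std::chrono::duration<double>(end - start).count();
    printf("%.2f s, %.2f useful GFLOPS\n", seconds, double(n) * n * n / seconds / 1e9);

    return 0;
}
```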
-$(D \circ D)_{ij} = \min_k(D_{ik} + D_{kj})$ +## Baseline ----- +The result of multiplying an $l \times n$ matrix $A$ by an $n \times m$ matrix $B$ is defined as an $l \times m$ matrix $C$ such that: -Graph interpretation: -find shortest paths of length 2 between all vertices in a fully-connected weighted graph +$$ +C_{ij} = \sum_{k=1}^{n} A_{ik} \cdot B_{kj} +$$ -![](https://i.imgur.com/Zf4G7qj.png) +For simplicity, we will only consider *square* matrices, where $l = m = n$. ----- +To implement matrix multiplication, we can simply transfer this definition into code, but instead of two-dimensional arrays (aka matrices), we will be using one-dimensional arrays to be explicit about pointer arithmetic: -A cool thing about distance product is that if if we iterate the process and calculate: +```c++ +void matmul(const float *a, const float *b, float *c, int n) { + for (int i = 0; i < n; i++) + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + c[i * n + j] += a[i * n + k] * b[k * n + j]; +} +``` -$D_2 = D \circ D, \;\; -D_4 = D_2 \circ D_2, \;\; -D_8 = D_4 \circ D_4, \;\; -\ldots$ +For reasons that will become apparent later, we will only use matrix sizes that are multiples of $48$ for benchmarking, but the implementations remain correct for all others. We also use [32-bit floats](/hpc/arithmetic/ieee-754) specifically, although all implementations can be easily [generalized](#generalizations) to other data types and operations. -Then we can find all-pairs shortest distances in $O(\log n)$ steps +Compiled with `g++ -O3 -march=native -ffast-math -funroll-loops`, the naive approach multiplies two matrices of size $n = 1920 = 48 \times 40$ in ~16.7 seconds. To put it in perspective, this is approximately $\frac{1920^3}{16.7 \times 10^9} \approx 0.42$ useful operations per nanosecond (GFLOPS), or roughly 5 CPU cycles per multiplication, which doesn't look that good yet. -(but recall that there are [more direct ways](https://en.wikipedia.org/wiki/Floyd%E2%80%93Warshall_algorithm) to solve it) +## Transposition ---- +In general, when optimizing an algorithm that processes large quantities of data — and $1920^2 \times 3 \times 4 \approx 42$ MB clearly is a large quantity as it can't fit into any of the [CPU caches](/hpc/cpu-cache) — one should always start with memory before optimizing arithmetic, as it is much more likely to be the bottleneck. -## V0: Baseline +The field $C_{ij}$ can be thought of as the dot product of row $i$ of matrix $A$ and column $j$ of matrix $B$. As we increment `k` in the inner loop above, we are reading the matrix `a` sequentially, but we are jumping over $n$ elements as we iterate over a column of `b`, which is [not as fast](/hpc/cpu-cache/aos-soa) as sequential iteration. -Implement the definition of what we need to do, but using arrays instead of matrices: +One [well-known](/hpc/external-memory/oblivious/#matrix-multiplication) optimization that tackles this problem is to store matrix $B$ in *column-major* order — or, alternatively, to *transpose* it before the matrix multiplication. 
This requires $O(n^2)$ additional operations but ensures sequential reads in the innermost loop: -```cpp -const float infty = std::numeric_limits::infinity(); + + +```c++ +void matmul(const float *a, const float *_b, float *c, int n) { + float *b = new float[n * n]; + + for (int i = 0; i < n; i++) + for (int j = 0; j < n; j++) + b[i * n + j] = _b[j * n + i]; + + for (int i = 0; i < n; i++) + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + c[i * n + j] += a[i * n + k] * b[j * n + k]; // <- note the indices } ``` -Compile with `g++ -O3 -march=native -std=c++17` +This code runs in ~12.4s, or about 30% faster. -On our Intel Core i5-6500 ("Skylake", 4 cores, 3.6 GHz) with $n=4000$ it runs for 99s, -which amounts to ~1.3B useful floating point operations per second +As we will see in a bit, there are more important benefits to transposing it than just the sequential memory reads. ---- +## Vectorization -## Theoretical Performance +Now that all we do is just sequentially read the elements of `a` and `b`, multiply them, and add the result to an accumulator variable, we can use [SIMD](/hpc/simd/) instructions to speed it all up. It is pretty straightforward to implement using [GCC vector types](/hpc/simd/intrinsics/#gcc-vector-extensions) — we can [memory-align](/hpc/cpu-cache/alignment/) matrix rows, pad them with zeros, and then compute the multiply-sum as we would normally compute any other [reduction](/hpc/simd/reduction/): -$$ -\underbrace{4}_{CPUs} \cdot \underbrace{8}_{SIMD} \cdot \underbrace{2}_{1/thr} \cdot \underbrace{3.6 \cdot 10^9}_{cycles/sec} = 230.4 \; GFLOPS \;\; (2.3 \cdot 10^{11}) -$$ +```c++ +// a vector of 256 / 32 = 8 floats +typedef float vec __attribute__ (( vector_size(32) )); -RAM bandwidth: 34.1 GB/s (or ~10 bytes per cycle) - +// a helper function that allocates n vectors and initializes them with zeros +vec* alloc(int n) { + vec* ptr = (vec*) std::aligned_alloc(32, 32 * n); + memset(ptr, 0, 32 * n); + return ptr; +} ---- +void matmul(const float *_a, const float *_b, float *c, int n) { + int nB = (n + 7) / 8; // number of 8-element vectors in a row (rounded up) + + vec *a = alloc(n * nB); + vec *b = alloc(n * nB); + + // move both matrices to the aligned region + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + a[i * nB + j / 8][j % 8] = _a[i * n + j]; + b[i * nB + j / 8][j % 8] = _b[j * n + i]; // <- b is still transposed + } + } + + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + vec s{}; // initialize the accumulator with zeros + + // vertical summation + for (int k = 0; k < nB; k++) + s += a[i * nB + k] * b[j * nB + k]; + + // horizontal summation + for (int k = 0; k < 8; k++) + c[i * n + j] += s[k]; + } + } -## OpenMP + std::free(a); + std::free(b); +} +``` -* We have 4 cores, so why don't we use them? -* There are low-level ways of creating threads, but they involve a lot of code -* We will use a high-level interface called OpenMP -* (We will talk about multithreading in much more detail on the next lecture) +The performance for $n = 1920$ is now around 2.3 GFLOPS — or another ~4 times higher compared to the transposed but not vectorized version. -![](https://www.researchgate.net/profile/Mario_Storti/publication/231168223/figure/fig2/AS:393334787985424@1470789729707/The-master-thread-creates-a-team-of-parallel-threads.png =400x) +![](../img/mm-vectorized-barplot.svg) ----- +This optimization looks neither too complex nor specific to matrix multiplication. 
Why can't the compiler [auto-vectorizee](/hpc/simd/auto-vectorization/) the inner loop by itself? -## Multithreading Made Easy +It actually can; the only thing preventing that is the possibility that `c` overlaps with either `a` or `b`. To rule it out, you can communicate to the compiler that you guarantee `c` is not [aliased](/hpc/compilation/contracts/#memory-aliasing) with anything by adding the `__restrict__` keyword to it: -All you need to know for now is the `#pragma omp parallel for` directive + -```cpp -#pragma omp parallel for -for (int i = 0; i < 10; ++i) { - do_stuff(i); +```c++ +void matmul(const float *a, const float *_b, float * __restrict__ c, int n) { + // ... } ``` -It splits iterations of a loop among multiple threads +Both manually and auto-vectorized implementations perform roughly the same. -There are many ways to control scheduling, -but we'll just leave defaults because our use case is simple - + -## Warning: Data Races +## Memory efficiency -This only works when all iterations can safely be executed simultaneously -It's not always easy to determine, but for now following rules of thumb are enough: +What is interesting is that the implementation efficiency depends on the problem size. -* There must not be any shared data element that is read by X and written by Y -* There must not be any shared data element that is written by X and written by Y +At first, the performance (defined as the number of useful operations per second) increases as the overhead of the loop management and the horizontal reduction decreases. Then, at around $n=256$, it starts smoothly decreasing as the matrices stop fitting into the [cache](/hpc/cpu-cache/) ($2 \times 256^2 \times 4 = 512$ KB is the size of the L2 cache), and the performance becomes bottlenecked by the [memory bandwidth](/hpc/cpu-cache/bandwidth/). -E. g. sum can't be parallelized this way, as threads would modify a shared variable - +![](../img/mm-vectorized-plot.svg) ---- +It is also interesting that the naive implementation is mostly on par with the non-vectorized transposed version — and even slightly better because it doesn't need to perform a transposition. -## Parallel Baseline - -OpenMP is included in compilers: just add `-fopenmp` flag and that's it - -```cpp -void step(float* r, const float* d, int n) { - #pragma omp parallel for - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - float v = infty; - for (int k = 0; k < n; ++k) { - float x = d[n*i + k]; - float y = d[n*k + j]; - float z = x + y; - v = std::min(v, z); - } - r[n*i + j] = v; - } - } -} -``` +One might think that there would be some general performance gain from doing sequential reads since we are fetching fewer cache lines, but this is not the case: fetching the first column of `b` indeed takes more time, but the next 15 column reads will be in the same cache lines as the first one, so they will be cached anyway — unless the matrix is so large that it can't even fit `n * cache_line_size` bytes into the cache, which is not the case for any practical matrix sizes. -Runs ~4x times faster, as it should +Instead, the performance deteriorates on only a few specific matrix sizes due to the effects of [cache associativity](/hpc/cpu-cache/associativity/): when $n$ is a multiple of a large power of two, we are fetching the addresses of `b` that all likely map to the same cache line, which reduces the effective cache size. 
This explains the 30% performance dip for $n = 1920 = 2^7 \times 3 \times 5$, and you can see an even more noticeable one for $1536 = 2^9 \times 3$: it is roughly 3 times slower than for $n=1535$. ---- +So, counterintuitively, transposing the matrix doesn't help with caching — and in the naive scalar implementation, we are not really bottlenecked by the memory bandwidth anyway. But our vectorized implementation certainly is, so let's work on its I/O efficiency. -## Memory Bottleneck +## Register reuse -![](https://i.imgur.com/z4d6aez.png =450x) +Using a Python-like notation to refer to submatrices, to compute the cell $C[x][y]$, we need to calculate the dot product of $A[x][:]$ and $B[:][y]$, which requires fetching $2n$ elements, even if we store $B$ in column-major order. -(It is slower on macOS because of smaller page sizes) + ----- +To compute $C[x:x+2][y:y+2]$, a $2 \times 2$ submatrix of $C$, we would need two rows from $A$ and two columns from $B$, namely $A[x:x+2][:]$ and $B[:][y:y+2]$, containing $4n$ elements in total, to update *four* elements instead of *one* — which is $\frac{2n / 1}{4n / 4} = 2$ times better in terms of I/O efficiency. -## Virtual Memory + -## V1: Linear Reading +To avoid fetching data more than once, we need to iterate over these rows and columns in parallel and calculate all $2 \times 2$ possible combinations of products. Here is a proof of concept: -Just transpose it, as we did with matrices +```c++ +void kernel_2x2(int x, int y) { + int c00 = 0, c01 = 0, c10 = 0, c11 = 0; -```cpp -void step(float* r, const float* d, int n) { - std::vector t(n*n); - #pragma omp parallel for - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - t[n*j + i] = d[n*i + j]; - } - } + for (int k = 0; k < n; k++) { + // read rows + int a0 = a[x][k]; + int a1 = a[x + 1][k]; - #pragma omp parallel for - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - float v = std::numeric_limits::infinity(); - for (int k = 0; k < n; ++k) { - float x = d[n*i + k]; - float y = t[n*j + k]; - float z = x + y; - v = std::min(v, z); - } - r[n*i + j] = v; - } + // read columns + int b0 = b[k][y]; + int b1 = b[k][y + 1]; + + // update all combinations + c00 += a0 * b0; + c01 += a0 * b1; + c10 += a1 * b0; + c11 += a1 * b1; } + + // write the results to C + c[x][y] = c00; + c[x][y + 1] = c01; + c[x + 1][y] = c10; + c[x + 1][y + 1] = c11; } ``` ----- +We can now simply call this kernel on all 2x2 submatrices of $C$, but we won't bother evaluating it: although this algorithm is better in terms of I/O operations, it would still not beat our SIMD-based implementation. Instead, we will extend this approach and develop a similar *vectorized* kernel right away. -![](https://i.imgur.com/UwxcEG7.png =600x) + ---- +## Designing the kernel -## V2: Instruction-Level Parallelism +Instead of designing a kernel that computes an $h \times w$ submatrix of $C$ from scratch, we will declare a function that *updates* it using columns from $l$ to $r$ of $A$ and rows from $l$ to $r$ of $B$. For now, this seems like an over-generalization, but this function interface will prove useful later. -We can apply the same trick as we did with array sum earlier, so that instead of: + -```cpp -v0 = min(v0, z0); -v1 = min(v1, z1); -v0 = min(v0, z2); -v1 = min(v1, z3); -v0 = min(v0, z4); -... -v = min(v0, v1); -``` +To determine $h$ and $w$, we have several performance considerations: ----- +- In general, to compute an $h \times w$ submatrix, we need to fetch $2 \cdot n \cdot (h + w)$ elements. 
To optimize the I/O efficiency, we want the $\frac{h \cdot w}{h + w}$ ratio to be high, which is achieved with large and square-ish submatrices. +- We want to use the [FMA](https://en.wikipedia.org/wiki/FMA_instruction_set) ("fused multiply-add") instruction available on all modern x86 architectures. As you can guess from the name, it performs the `c += a * b` operation — which is the core of a dot product — on 8-element vectors in one go, which saves us from executing vector multiplication and addition separately. +- To achieve better utilization of this instruction, we want to make use of [instruction-level parallelism](/hpc/pipelining/). On Zen 2, the `fma` instruction has a latency of 5 and a throughput of 2, meaning that we need to concurrently execute at least $5 \times 2 = 10$ of them to saturate its execution ports. +- We want to avoid register spill (move data to and from registers more than necessary), and we only have $16$ logical vector registers that we can use as accumulators (minus those that we need to hold temporary values). -![](https://i.imgur.com/ihMC6z2.png) +For these reasons, we settle on a $6 \times 16$ kernel. This way, we process $96$ elements at once that are stored in $6 \times 2 = 12$ vector registers. To update them efficiently, we use the following procedure: -Our memory layout looks like this now + - #pragma omp parallel for - for (int j = 0; j < n; ++j) { - for (int i = 0; i < n; ++i) { - d[nab*j + i] = d_[n*j + i]; - t[nab*j + i] = d_[n*i + j]; - } - } +```c++ +// update 6x16 submatrix C[x:x+6][y:y+16] +// using A[x:x+6][l:r] and B[l:r][y:y+16] +void kernel(float *a, vec *b, vec *c, int x, int y, int l, int r, int n) { + vec t[6][2]{}; // will be zero-filled and stored in ymm registers - #pragma omp parallel for - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - // vv[0] = result for k = 0, 4, 8, ... - // vv[1] = result for k = 1, 5, 9, ... - // vv[2] = result for k = 2, 6, 10, ... - // vv[3] = result for k = 3, 7, 11, ... - float vv[nb]; - for (int kb = 0; kb < nb; ++kb) { - vv[kb] = infty; - } - for (int ka = 0; ka < na; ++ka) { - for (int kb = 0; kb < nb; ++kb) { - float x = d[nab*i + ka * nb + kb]; - float y = t[nab*j + ka * nb + kb]; - float z = x + y; - vv[kb] = std::min(vv[kb], z); - } - } - // v = result for k = 0, 1, 2, ... - float v = infty; - for (int kb = 0; kb < nb; ++kb) { - v = std::min(vv[kb], v); - } - r[n*i + j] = v; + for (int k = l; k < r; k++) { + for (int i = 0; i < 6; i++) { + // broadcast a[x + i][k] into a register + vec alpha = vec{} + a[(x + i) * n + k]; // converts to a broadcast + // multiply b[k][y:y+16] by it and update t[i][0] and t[i][1] + for (int j = 0; j < 2; j++) + t[i][j] += alpha * b[(k * n + y) / 8 + j]; // converts to an fma } } + + // write the results back to C + for (int i = 0; i < 6; i++) + for (int j = 0; j < 2; j++) + c[((x + i) * n + y) / 8 + j] += t[i][j]; } ``` ----- +We need `t` so that the compiler stores these elements in vector registers. We could just update their final destinations in `c`, but, unfortunately, the compiler re-writes them back to memory, causing a slowdown (wrapping everything in `__restrict__` keywords doesn't help). 
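As a sanity check while developing kernels like this one, it helps to compare a single updated tile against the naive definition. The snippet below is a hypothetical test of mine, not part of the repository; it assumes $n$ is a multiple of 6, 16, and 8 so that no padding is needed, and reuses the `vec` type and `alloc` helper defined earlier (it also needs `<cassert>`, `<cstdlib>`, and `<cmath>`):

```c++
void test_kernel() {
    const int n = 48; // divisible by 6, 16, and 8, so no padding is required

    vec *va = alloc(n * n / 8);
    vec *vb = alloc(n * n / 8);
    vec *vc = alloc(n * n / 8); // zero-filled by alloc

    float *a = (float*) va, *b = (float*) vb, *c = (float*) vc;

    for (int i = 0; i < n * n; i++) {
        a[i] = rand() % 100;
        b[i] = rand() % 100;
    }

    kernel(a, vb, vc, 0, 0, 0, n, n); // update the 6x16 tile of C at (0, 0)

    for (int i = 0; i < 6; i++)
        for (int j = 0; j < 16; j++) {
            float s = 0;
            for (int k = 0; k < n; k++)
                s += a[i * n + k] * b[k * n + j];
            assert(std::abs(c[i * n + j] - s) < 1e-3);
        }
}
```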
-![](https://i.imgur.com/5uHVRL4.png =600x) +After unrolling these loops and hoisting `b` out of the `i` loop (`b[(k * n + y) / 8 + j]` does not depend on `i` and can be loaded once and reused in all 6 iterations), the compiler generates something more similar to this: ---- - -## V3: Vectorization + -![](https://i.imgur.com/EG0WjHl.png =400x) +```c++ +for (int k = l; k < r; k++) { + __m256 b0 = _mm256_load_ps((__m256*) &b[k * n + y]; + __m256 b1 = _mm256_load_ps((__m256*) &b[k * n + y + 8]; + + __m256 a0 = _mm256_broadcast_ps((__m128*) &a[x * n + k]); + t00 = _mm256_fmadd_ps(a0, b0, t00); + t01 = _mm256_fmadd_ps(a0, b1, t01); ----- + __m256 a1 = _mm256_broadcast_ps((__m128*) &a[(x + 1) * n + k]); + t10 = _mm256_fmadd_ps(a1, b0, t10); + t11 = _mm256_fmadd_ps(a1, b1, t11); -```cpp -static inline float8_t min8(float8_t x, float8_t y) { - return x < y ? x : y; + // ... } +``` -void step(float* r, const float* d_, int n) { - // elements per vector - constexpr int nb = 8; - // vectors per input row - int na = (n + nb - 1) / nb; - - // input data, padded, converted to vectors - float8_t* vd = float8_alloc(n*na); - // input data, transposed, padded, converted to vectors - float8_t* vt = float8_alloc(n*na); - - #pragma omp parallel for - for (int j = 0; j < n; ++j) { - for (int ka = 0; ka < na; ++ka) { - for (int kb = 0; kb < nb; ++kb) { - int i = ka * nb + kb; - vd[na*j + ka][kb] = i < n ? d_[n*j + i] : infty; - vt[na*j + ka][kb] = i < n ? d_[n*i + j] : infty; - } - } - } +We are using $12+3=15$ vector registers and a total of $6 \times 3 + 2 = 20$ instructions to perform $16 \times 6 = 96$ updates. Assuming that there are no other bottleneks, we should be hitting the throughput of `_mm256_fmadd_ps`. - #pragma omp parallel for - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { - float8_t vv = f8infty; - for (int ka = 0; ka < na; ++ka) { - float8_t x = vd[na*i + ka]; - float8_t y = vt[na*j + ka]; - float8_t z = x + y; - vv = min8(vv, z); - } - r[n*i + j] = hmin8(vv); - } +Note that this kernel is architecture-specific. If we didn't have `fma`, or if its throughput/latency were different, or if the SIMD width was 128 or 512 bits, we would have made different design choices. Multi-platform BLAS implementations ship [many kernels](https://github.com/xianyi/OpenBLAS/tree/develop/kernel), each written in assembly by hand and optimized for a particular architecture. + +The rest of the implementation is straightforward. 
Similar to the previous vectorized implementation, we just move the matrices to memory-aligned arrays and call the kernel instead of the innermost loop: + +```c++ +void matmul(const float *_a, const float *_b, float *_c, int n) { + // to simplify the implementation, we pad the height and width + // so that they are divisible by 6 and 16 respectively + int nx = (n + 5) / 6 * 6; + int ny = (n + 15) / 16 * 16; + + float *a = alloc(nx * ny); + float *b = alloc(nx * ny); + float *c = alloc(nx * ny); + + for (int i = 0; i < n; i++) { + memcpy(&a[i * ny], &_a[i * n], 4 * n); + memcpy(&b[i * ny], &_b[i * n], 4 * n); // we don't need to transpose b this time } - std::free(vt); - std::free(vd); + for (int x = 0; x < nx; x += 6) + for (int y = 0; y < ny; y += 16) + kernel(a, (vec*) b, (vec*) c, x, y, 0, n, ny); + + for (int i = 0; i < n; i++) + memcpy(&_c[i * n], &c[i * ny], 4 * n); + + std::free(a); + std::free(b); + std::free(c); } ``` ----- +This improves the benchmark performance, but only by ~40%: -![](https://i.imgur.com/R3OvLKO.png =600x) +![](../img/mm-kernel-barplot.svg) ---- +The speedup is much higher (2-3x) on smaller arrays, indicating that there is still a memory bandwidth problem: -## V4: Register Reuse - -* At this point we are actually bottlenecked by memory -* It turns out that calculating one $r_{ij}$ at a time is not optimal -* We can reuse data that we read into registers to update other fields - ----- - -![](https://i.imgur.com/ljvD0ba.png =400x) - ----- - -```cpp -for (int ka = 0; ka < na; ++ka) { - float8_t y0 = vt[na*(jc * nd + 0) + ka]; - float8_t y1 = vt[na*(jc * nd + 1) + ka]; - float8_t y2 = vt[na*(jc * nd + 2) + ka]; - float8_t x0 = vd[na*(ic * nd + 0) + ka]; - float8_t x1 = vd[na*(ic * nd + 1) + ka]; - float8_t x2 = vd[na*(ic * nd + 2) + ka]; - vv[0][0] = min8(vv[0][0], x0 + y0); - vv[0][1] = min8(vv[0][1], x0 + y1); - vv[0][2] = min8(vv[0][2], x0 + y2); - vv[1][0] = min8(vv[1][0], x1 + y0); - vv[1][1] = min8(vv[1][1], x1 + y1); - vv[1][2] = min8(vv[1][2], x1 + y2); - vv[2][0] = min8(vv[2][0], x2 + y0); - vv[2][1] = min8(vv[2][1], x2 + y1); - vv[2][2] = min8(vv[2][2], x2 + y2); -} +![](../img/mm-kernel-plot.svg) + +Now, if you've read the section on [cache-oblivious algorithms](/hpc/external-memory/oblivious/), you know that one universal solution to these types of things is to split all matrices into four parts, perform eight recursive block matrix multiplications, and carefully combine the results together. This solution is okay in practice, but there is some [overhead to recursion](/hpc/architecture/functions/), and it also doesn't allow us to fine-tune the algorithm, so instead, we will follow a different, simpler approach. + +## Blocking + +The *cache-aware* alternative to the divide-and-conquer trick is *cache blocking*: splitting the data into blocks that can fit into the cache and processing them one by one. If we have more than one layer of cache, we can do hierarchical blocking: we first select a block of data that fits into the L3 cache, then we split it into blocks that fit into the L2 cache, and so on. This approach requires knowing the cache sizes in advance, but it is usually easier to implement and also faster in practice. + +Cache blocking is less trivial to do with matrices than with arrays, but the general idea is this: + +- Select a submatrix of $B$ that fits into the L3 cache (say, a subset of its columns). +- Select a submatrix of $A$ that fits into the L2 cache (say, a subset of its rows). 
+- Select a submatrix of the previously selected submatrix of $B$ (a subset of its rows) that fits into the L1 cache. +- Update the relevant submatrix of $C$ using the kernel. + +Here is a good [visualization](https://jukkasuomela.fi/cache-blocking-demo/) by Jukka Suomela (it features many different approaches; you are interested in the last one). + +Note that the decision to start this process with matrix $B$ is not arbitrary. During the kernel execution, we are reading the elements of $A$ much slower than the elements of $B$: we fetch and broadcast just one element of $A$ and then multiply it with $16$ elements of $B$. Therefore, we want $B$ to be in the L1 cache while $A$ can stay in the L2 cache and not the other way around. + +This sounds complicated, but we can implement it with just three more outer `for` loops, which are collectively called *macro-kernel* (and the highly optimized low-level function that updates a 6x16 submatrix is called *micro-kernel*): + +```c++ +const int s3 = 64; // how many columns of B to select +const int s2 = 120; // how many rows of A to select +const int s1 = 240; // how many rows of B to select + +for (int i3 = 0; i3 < ny; i3 += s3) + // now we are working with b[:][i3:i3+s3] + for (int i2 = 0; i2 < nx; i2 += s2) + // now we are working with a[i2:i2+s2][:] + for (int i1 = 0; i1 < ny; i1 += s1) + // now we are working with b[i1:i1+s1][i3:i3+s3] + // and we need to update c[i2:i2+s2][i3:i3+s3] with [l:r] = [i1:i1+s1] + for (int x = i2; x < std::min(i2 + s2, nx); x += 6) + for (int y = i3; y < std::min(i3 + s3, ny); y += 16) + kernel(a, (vec*) b, (vec*) c, x, y, i1, std::min(i1 + s1, n), ny); ``` -Ugly, but worth it +Cache blocking completely removes the memory bottleneck: ----- +![](../img/mm-blocked-barplot.svg) -![](https://i.imgur.com/GZvIt8J.png =600x) +The performance is no longer (significantly) affected by the problem size: ---- +![](../img/mm-blocked-plot.svg) -## V5: More Register Reuse +Notice that the dip at $1536$ is still there: cache associativity still affects the performance. To mitigate this, we can adjust the step constants or insert holes into the layout, but we will not bother doing that for now. -![](https://i.imgur.com/amUznoQ.png =400x) +## Optimization ----- +To approach closer to the performance limit, we need a few more optimizations: -![](https://i.imgur.com/24nBJ1Y.png =600x) +- Remove memory allocation and operate directly on the arrays that are passed to the function. Note that we don't need to do anything with `a` as we are reading just one element at a time, and we can use an [unaligned](/hpc/simd/moving/#aligned-loads-and-stores) `store` for `c` as we only use it rarely, so our only concern is reading `b`. +- Get rid of the `std::min` so that the size parameters are (mostly) constant and can be embedded into the machine code by the compiler (which also lets it [unroll](/hpc/architecture/loops/) the micro-kernel loop more efficiently and avoid runtime checks). +- Rewrite the micro-kernel by hand using 12 vector variables (the compiler seems to struggle with keeping them in registers and writes them first to a temporary memory location and only then to $C$). ---- +These optimizations are straightforward but quite tedious to implement, so we are not going to list [the code](https://github.com/sslotin/amh-code/blob/main/matmul/v5-unrolled.cc) here in the article. 
It also requires some more work to effectively support "weird" matrix sizes, which is why we only run benchmarks for sizes that are multiple of $48 = \frac{6 \cdot 16}{\gcd(6, 16)}$. -## V6: Software Prefetching + -## V7: Temporal Cache Locality +These individually small improvements compound and result in another 50% improvement: -![](https://i.imgur.com/29vTLKJ.png) +![](../img/mm-noalloc.svg) ----- +We are actually not that far from the theoretical performance limit — which can be calculated as the SIMD width times the `fma` instruction throughput times the clock frequency: -### Z-Curve +$$ +\underbrace{8}_{SIMD} \cdot \underbrace{2}_{thr.} \cdot \underbrace{2 \cdot 10^9}_{cycles/sec} = 32 \; GFLOPS \;\; (3.2 \cdot 10^{10}) +$$ -![](https://i.imgur.com/0optLZ3.png) +It is more representative to compare against some practical library, such as [OpenBLAS](https://www.openblas.net/). The laziest way to do it is to simply [invoke matrix multiplication from NumPy](/hpc/complexity/languages/#blas). There may be some minor overhead due to Python, but it ends up reaching 80% of the theoretical limit, which seems plausible (a 20% overhead is okay: matrix multiplication is not the only thing that CPUs are made for). ----- +![](../img/mm-blas.svg) -![](https://i.imgur.com/U3GaO5b.png) +We've reached ~93% of BLAS performance and ~75% of the theoretical performance limit, which is really great for what is essentially just 40 lines of C. ---- +Interestingly, the whole thing can be rolled into just one deeply nested `for` loop with a BLAS level of performance (assuming that we're in 2050 and using GCC version 35, which finally stopped screwing up with register spilling): + +```c++ +for (int i3 = 0; i3 < n; i3 += s3) + for (int i2 = 0; i2 < n; i2 += s2) + for (int i1 = 0; i1 < n; i1 += s1) + for (int x = i2; x < i2 + s2; x += 6) + for (int y = i3; y < i3 + s3; y += 16) + for (int k = i1; k < i1 + s1; k++) + for (int i = 0; i < 6; i++) + for (int j = 0; j < 2; j++) + c[x * n / 8 + i * n / 8 + y / 8 + j] + += (vec{} + a[x * n + i * n + k]) + * b[n / 8 * k + y / 8 + j]; +``` + +There is also an approach that performs asymptotically fewer arithmetic operations — [the Strassen algorithm](/hpc/external-memory/oblivious/#strassen-algorithm) — but it has a large constant factor, and it is only efficient for [very large matrices](https://arxiv.org/pdf/1605.01078.pdf) ($n > 4000$), where we typically have to use either multiprocessing or some approximate dimensionality-reducing methods anyway. + +## Generalizations + +FMA also supports 64-bit floating-point numbers, but it does not support integers: you need to perform addition and multiplication separately, which results in decreased performance. If you can guarantee that all intermediate results can be represented exactly as 32- or 64-bit floating-point numbers (which is [often the case](/hpc/arithmetic/errors/)), it may be faster to just convert them to and from floats. + +This approach can be also applied to some similar-looking computations. One example is the "min-plus matrix multiplication" defined as: + +$$ +(A \circ B)_{ij} = \min_{1 \le k \le n} (A_{ik} + B_{kj}) +$$ + +It is also known as the "distance product" due to its graph interpretation: when applied to itself $(D \circ D)$, the result is the matrix of shortest paths of length two between all pairs of vertices in a fully-connected weighted graph specified by the edge weight matrix $D$. 
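To make the analogy with matrix multiplication concrete, here is a minimal scalar sketch of the distance product; the function name, the row-major layout, and the `INF` constant are illustrative choices, not something fixed by the text:

```c++
#include <algorithm>
#include <climits>

const int INF = INT_MAX / 2; // "no edge"; halved so that INF + INF doesn't overflow

// computes the min-plus product d = a ∘ b for n x n matrices stored in row-major order
void min_plus(const int *a, const int *b, int *d, int n) {
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++) {
            int m = INF;
            for (int k = 0; k < n; k++)
                m = std::min(m, a[i * n + k] + b[k * n + j]);
            d[i * n + j] = m;
        }
}
```

It is the same triple loop as in matrix multiplication, with $(\min, +)$ taking the place of $(+, \times)$, which is why the same kernel design and blocking scheme carry over.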
+ +A cool thing about the distance product is that if we iterate the process and calculate + +$$ +D_2 = D \circ D \\ +D_4 = D_2 \circ D_2 \\ +D_8 = D_4 \circ D_4 \\ +\ldots +$$ + +…we can find all-pairs shortest paths in $O(\log n)$ steps: + +```c++ +for (int l = 0; l < logn; l++) + for (int i = 0; i < n; i++) + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + d[i][j] = min(d[i][j], d[i][k] + d[k][j]); +``` + +This requires $O(n^3 \log n)$ operations. If we do these two-edge relaxations in a particular order, we can do it with just one pass, which is known as the [Floyd-Warshall algorithm](https://en.wikipedia.org/wiki/Floyd%E2%80%93Warshall_algorithm): + +```c++ +for (int k = 0; k < n; k++) + for (int i = 0; i < n; i++) + for (int j = 0; j < n; j++) + d[i][j] = min(d[i][j], d[i][k] + d[k][j]); +``` + +Interestingly, similarly vectorizing the distance product and executing it $O(\log n)$ times ([or possibly fewer](https://arxiv.org/pdf/1904.01210.pdf)) in $O(n^3 \log n)$ total operations is faster than naively executing the Floyd-Warshall algorithm in $O(n^3)$ operations, although not by a lot. + +As an exercise, try to speed up this "for-for-for" computation. It is harder to do than in the matrix multiplication case because now there is a logical dependency between the iterations, and you need to perform updates in a particular order, but it is still possible to design [a similar kernel and a block iteration order](https://github.com/sslotin/amh-code/blob/main/floyd/blocked.cc) that achieves a 30-50x total speedup. + +## Acknowledgements -## Summary +The final algorithm was originally designed by Kazushige Goto, and it is the basis of GotoBLAS and OpenBLAS. The author himself describes it in more detail in "[Anatomy of High-Performance Matrix Multiplication](https://www.cs.utexas.edu/~flame/pubs/GotoTOMS_revision.pdf)". -* Deal with memory problems first (make sure data fits L3 cache) -* SIMD can get you ~10x speedup -* ILP can get you 2-3x speedup -* Multi-core parallelism can get you $NUM_CORES speedup - (and it can be just one `#pragma omp parallel for` away) +The exposition style is inspired by the "[Programming Parallel Computers](http://ppc.cs.aalto.fi/)" course by Jukka Suomela, which features a [similar case study](http://ppc.cs.aalto.fi/ch2/) on speeding up the distance product. diff --git a/content/english/hpc/algorithms/parsing.md b/content/english/hpc/algorithms/parsing.md deleted file mode 100644 index c189e66a..00000000 --- a/content/english/hpc/algorithms/parsing.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Parsing with SIMD -weight: 5 -draft: true ---- diff --git a/content/english/hpc/algorithms/prefix.md b/content/english/hpc/algorithms/prefix.md index 01294046..43bfd560 100644 --- a/content/english/hpc/algorithms/prefix.md +++ b/content/english/hpc/algorithms/prefix.md @@ -1,6 +1,248 @@ --- -title: Prefix Sum With SIMD -draft: true +title: Prefix Sum with SIMD +weight: 8 --- -... +The *prefix sum*, also known as *cumulative sum*, *inclusive scan*, or simply *scan*, is a sequence of numbers $b_i$ generated from another sequence $a_i$ using the following rule: + +$$ +\begin{aligned} +b_0 &= a_0 +\\ b_1 &= a_0 + a_1 +\\ b_2 &= a_0 + a_1 + a_2 +\\ &\ldots +\end{aligned} +$$ + +In other words, the $k$-th element of the output sequence is the sum of the first $k$ elements of the input sequence. 
+ +Prefix sum is a very important primitive in many algorithms, especially in the context of parallel algorithms, where its computation scales almost perfectly with the number of processors. Unfortunately, it is much harder to speed up with SIMD parallelism on a single CPU core, but we will try it nonetheless — and derive an algorithm that is ~2.5x faster than the baseline scalar implementation. + +### Baseline + +For our baseline, we could just invoke `std::partial_sum` from the STL, but for clarity, we will implement it manually. We create an array of integers and then sequentially add the previous element to the current one: + +```c++ +void prefix(int *a, int n) { + for (int i = 1; i < n; i++) + a[i] += a[i - 1]; +} +``` + +It seems like we need two reads, an add, and a write on each iteration, but of course, the compiler optimizes the extra read away and uses a register as the accumulator: + +```nasm +loop: + add edx, DWORD PTR [rax] + mov DWORD PTR [rax-4], edx + add rax, 4 + cmp rax, rcx + jne loop +``` + +After [unrolling](/hpc/architecture/loops) the loop, just two instructions effectively remain: the fused read-add and the write-back of the result. Theoretically, these should work at 2 GFLOPS (1 element per CPU cycle, by the virtue of [superscalar processing](/hpc/pipelining)), but since the memory system has to constantly [switch](/hpc/cpu-cache/bandwidth#directional-access) between reading and writing, the actual performance is between 1.2 and 1.6 GFLOPS, depending on the array size. + +### Vectorization + +One way to implement a parallel prefix sum algorithm is to split the array into small blocks, independently calculate *local* prefix sums on them, and then do a second pass where we adjust the computed values in each block by adding the sum of all previous elements to them. + +![](../img/prefix-outline.png) + +This allows processing each block in parallel — both during the computation of the local prefix sums and the accumulation phase — so you usually split the array into as many blocks as you have processors. But since we are only allowed to use one CPU core, and [non-sequential memory access](/hpc/simd/moving#non-contiguous-load) in SIMD doesn't work well, we are not going to do that. Instead, we will use a fixed block size equal to the size of a SIMD lane and calculate prefix sums within a register. + +Now, to compute these prefix sums locally, we are going to use another parallel prefix sum method that is generally inefficient (the total work is $O(n \log n)$ instead of linear) but is good enough for the case when the data is already in a SIMD register. The idea is to perform $\log n$ iterations where on $k$-th iteration, we add $a_{i - 2^k}$ to $a_i$ for all applicable $i$: + +```c++ +for (int l = 0; l < logn; l++) + // (atomically and in parallel): + for (int i = (1 << l); i < n; i++) + a[i] += a[i - (1 << l)]; +``` + +We can prove that this algorithm works by induction: if on $k$-th iteration every element $a_i$ is equal to the sum of the $(i - 2^k, i]$ segment of the original array, then after adding $a_{i - 2^k}$ to it, it will be equal to the sum of $(i - 2^{k+1}, i]$. After $O(\log n)$ iterations, the array will turn into its prefix sum. + +To implement it in SIMD, we could use [permutations](/hpc/simd/shuffling) to place $i$-th element against $(i-2^k)$-th, but they are too slow. 
Instead, we will use the `sll` ("shift lanes left") instruction that does exactly that and also replaces the unmatched elements with zeros: + +```c++ +typedef __m128i v4i; + +v4i prefix(v4i x) { +    // x = 1, 2, 3, 4 +    x = _mm_add_epi32(x, _mm_slli_si128(x, 4)); +    // x = 1, 2, 3, 4 +    //   + 0, 1, 2, 3 +    //   = 1, 3, 5, 7 +    x = _mm_add_epi32(x, _mm_slli_si128(x, 8)); +    // x = 1, 3, 5, 7 +    //   + 0, 0, 1, 3 +    //   = 1, 3, 6, 10 +    return x; +} +``` + +Unfortunately, the 256-bit version of this instruction performs this byte shift independently within two 128-bit lanes, which is typical of AVX: + +```c++ +typedef __m256i v8i; + +v8i prefix(v8i x) { +    // x = 1, 2, 3, 4, 5, 6, 7, 8 +    x = _mm256_add_epi32(x, _mm256_slli_si256(x, 4)); +    x = _mm256_add_epi32(x, _mm256_slli_si256(x, 8)); +    x = _mm256_add_epi32(x, _mm256_slli_si256(x, 16)); // <- this does nothing +    // x = 1, 3, 6, 10, 5, 11, 18, 26 +    return x; +} +``` + +We can still use it to compute 4-element prefix sums twice as fast, but we'll have to switch to 128-bit SSE when accumulating. Let's write a handy function that computes a local prefix sum end-to-end: + +```c++ +void prefix(int *p) { +    v8i x = _mm256_load_si256((v8i*) p); +    x = _mm256_add_epi32(x, _mm256_slli_si256(x, 4)); +    x = _mm256_add_epi32(x, _mm256_slli_si256(x, 8)); +    _mm256_store_si256((v8i*) p, x); +} +``` + +Now, for the accumulate phase, we will create another handy function that similarly takes the pointer to a 4-element block and also the 4-element vector of the previous prefix sum. The job of this function is to add this prefix sum vector to the block and update it so that it can be passed on to the next block (by broadcasting the last element of the block before the addition): + + + +```c++ +v4i accumulate(int *p, v4i s) { +    v4i d = (v4i) _mm_broadcast_ss((float*) &p[3]); +    v4i x = _mm_load_si128((v4i*) p); +    x = _mm_add_epi32(s, x); +    _mm_store_si128((v4i*) p, x); +    return _mm_add_epi32(s, d); +} +``` + +With `prefix` and `accumulate` implemented, the only thing left is to glue together our two-pass algorithm: + +```c++ +void prefix(int *a, int n) { +    for (int i = 0; i < n; i += 8) +        prefix(&a[i]); + +    v4i s = _mm_setzero_si128(); +     +    for (int i = 0; i < n; i += 4) +        s = accumulate(&a[i], s); +} +``` + +The algorithm already performs slightly more than twice as fast as the scalar implementation but becomes slower for large arrays that fall out of the L3 cache — roughly at half the [two-way RAM bandwidth](/hpc/cpu-cache/bandwidth) as we are reading the entire array twice. + +![](../img/prefix-simd.svg) + +Another interesting data point: if we only execute the `prefix` phase, the performance would be ~8.1 GFLOPS. The `accumulate` phase is slightly slower at ~5.8 GFLOPS. Sanity check: the total performance should be $\frac{1}{ \frac{1}{5.8} + \frac{1}{8.1} } \approx 3.4$. + +### Blocking + +So, we have a memory bandwidth problem for large arrays. We can avoid re-fetching the entire array from RAM if we split it into blocks that fit in the cache and process them separately.
All we need to pass to the next block is the sum of the previous ones, so we can design a `local_prefix` function with an interface similar to `accumulate`: + +```c++ +const int B = 4096; // <- ideally should be slightly less or equal to the L1 cache + +v4i local_prefix(int *a, v4i s) { + for (int i = 0; i < B; i += 8) + prefix(&a[i]); + + for (int i = 0; i < B; i += 4) + s = accumulate(&a[i], s); + + return s; +} + +void prefix(int *a, int n) { + v4i s = _mm_setzero_si128(); + for (int i = 0; i < n; i += B) + s = local_prefix(a + i, s); +} +``` + +(We have to make sure that $N$ is a multiple of $B$, but we are going to ignore such implementation details for now.) + +The blocked version performs considerably better, and not just for when the array is in the RAM: + +![](../img/prefix-blocked.svg) + +The speedup in the RAM case compared to the non-blocked implementation is only ~1.5 and not 2. This is because the memory controller is sitting idle while we iterate over the cached block for the second time instead of fetching the next one — the [hardware prefetcher](/hpc/cpu-cache/prefetching) isn't advanced enough to detect this pattern. + +### Continuous Loads + +There are several ways to solve this under-utilization problem. The obvious one is to use [software prefetching](/hpc/cpu-cache/prefetching) to explicitly request the next block while we are still processing the current one. + +It is better to add prefetching to the `accumulate` phase because it is slower and less memory-intensive than `prefix`: + +```c++ +v4i accumulate(int *p, v4i s) { + __builtin_prefetch(p + B); // <-- prefetch the next block + // ... + return s; +} +``` + +The performance slightly decreases for in-cache arrays, but approaches closer to 2 GFLOPS for the in-RAM ones: + +![](../img/prefix-prefetch.svg) + +Another approach is to do *interleaving* of the two phases. Instead of separating and alternating between them in large blocks, we can execute the two phases concurrently, with the `accumulate` phase lagging behind by a fixed number of iterations — similar to the [CPU pipeline](/hpc/pipelining): + +```c++ +const int B = 64; +// ^ small sizes cause pipeline stalls +// large sizes cause cache system inefficiencies + +void prefix(int *a, int n) { + v4i s = _mm_setzero_si128(); + + for (int i = 0; i < B; i += 8) + prefix(&a[i]); + + for (int i = B; i < n; i += 8) { + prefix(&a[i]); + s = accumulate(&a[i - B], s); + s = accumulate(&a[i - B + 4], s); + } + + for (int i = n - B; i < n; i += 4) + s = accumulate(&a[i], s); +} +``` + +This has more benefits: the loop progresses at a constant speed, reducing the pressure on the memory system, and the scheduler sees the instructions of both subroutines, allowing it to be more efficient at assigning instruction to execution ports — sort of like hyper-threading, but in code. + +For these reasons, the performance improves even on small arrays: + +![](../img/prefix-interleaved.svg) + +And finally, it doesn't seem that we are bottlenecked by the [memory read port](/hpc/pipelining/tables/) or the [decode width](/hpc/architecture/layout/#cpu-front-end), so we can add prefetching for free, which improves the performance even more: + +![](../img/prefix-interleaved-prefetch.svg) + +The total speedup we were able to achieve is between $\frac{4.2}{1.5} \approx 2.8$ for small arrays and $\frac{2.1}{1.2} \approx 1.75$ for large arrays. 
+ +The speedup may be higher for lower-precision data compared to the scalar code, as it is pretty much limited to executing one iteration per cycle regardless of the operand size, but it is still sort of "meh" when compared to some [other SIMD-based algorithms](../argmin). This is largely because there isn't a full-register byte shift in AVX that would allow the `accumulate` stage to proceed twice as fast, let alone a dedicated prefix sum instruction. + +### Other Relevant Work + +You can read [this paper from Columbia](http://www.adms-conf.org/2020-camera-ready/ADMS20_05.pdf) that focuses on the multi-core setting and AVX-512 (which [sort of](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=3037,4870,6715,4845,3853,90,7307,5993,2692,6946,6949,5456,6938,5456,1021,3007,514,518,7253,7183,3892,5135,5260,3915,4027,3873,7401,4376,4229,151,2324,2310,2324,591,4075,6130,4875,6385,5259,6385,6250,1395,7253,6452,7492,4669,4669,7253,1039,1029,4669,4707,7253,7242,848,879,848,7251,4275,879,874,849,833,6046,7250,4870,4872,4875,849,849,5144,4875,4787,4787,4787,3016,3018,5227,7359,7335,7392,4787,5259,5230,5230,5223,6438,488,483,6165,6570,6554,289,6792,6554,5230,6385,5260,5259,289,288,3037,3009,590,604,633,5230,5259,6554,6554,5259,6547,6554,3841,5214,5229,5260,5259,7335,5259,519,1029,515,3009,3009,3013,3011,515,6527,652,6527,6554,288&text=_mm512_alignr_epi32&techs=AVX_512) has a fast 512-bit register byte shift) and [this StackOverflow question](https://stackoverflow.com/questions/10587598/simd-prefix-sum-on-intel-cpu) for a more general discussion. + +Most of what I've described in this article was already known. To the best of my knowledge, my contribution here is the interleaving technique, which is responsible for a modest ~20% performance increase. There probably are ways to improve it further, but not by a lot. + +There is also this professor at CMU, [Guy Blelloch](https://www.cs.cmu.edu/~blelloch/), who [advocated](https://www.cs.cmu.edu/~blelloch/papers/sc90.pdf) for a dedicated prefix sum hardware back in the 90s when [vector processors](https://en.wikipedia.org/wiki/Vector_processor) were still a thing. Prefix sums are very important for parallel applications, and the hardware is becoming increasingly more parallel, so maybe, in the future, the CPU manufacturers will revitalize this idea and make prefix sum calculations slightly easier. + + + diff --git a/content/english/hpc/algorithms/reading-integers.md b/content/english/hpc/algorithms/reading-integers.md new file mode 100644 index 00000000..de9da4e9 --- /dev/null +++ b/content/english/hpc/algorithms/reading-integers.md @@ -0,0 +1,59 @@ +--- +title: Reading Decimal Integers +weight: 10 +draft: true +--- + +I wrote a new integer parsing algorithm that is ~35x faster than scanf. + +(No, this is not an April Fools' joke — although it does sound ridiculous.) + +Zen 2 @ 2GHz. The compiler is Clang 13. + +Ridiculous. + +### Iostream + +### Scanf + +### Syncronization + +### Getchar + +### Buffering + +### SIMD + +http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html + + +### Serial + +### Transpose-based approach + +### Instruction-level parallelism + + +### Modifications + +ILP benefits would not be that huge. + +One huge asterisk. We get the integers, and we can even do other parsing algorithms on them. + +1.75 cycles per byte. + +AVX-512 both due to larger SIMD lane size and dedicated operations for filtering. 
+ +It accounts for ~2% of all time, but it can be optimized by using special procedures. Pad buffer with any digits. + +### Future work + +Next time, we will be *writing* integers. + +You can create a string searcing algorithm by computing hashes in rabin-karp algorithm — although it does not seem to be possible to make an *exact* algorithm for that. + +## Acknowledgements + +http://0x80.pl/articles/simd-parsing-int-sequences.html + +https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 diff --git a/content/english/hpc/architecture/_index.md b/content/english/hpc/architecture/_index.md index b4361e36..5b61a175 100644 --- a/content/english/hpc/architecture/_index.md +++ b/content/english/hpc/architecture/_index.md @@ -6,6 +6,6 @@ weight: 2 When I began learning how to optimize programs myself, one big mistake I made was to rely primarily on the empirical approach. Not understanding how computers really worked, I would semi-randomly swap nested loops, rearrange arithmetic, combine branch conditions, inline functions by hand, and follow all sorts of other performance tips I've heard from other people, blindly hoping for improvement. -Unfortunately, this is how most programmers approach optimization. Most texts about performance do not teach you to reason about software performance qualitatively. Instead they give you general advices about certain implementation approaches — and general performance intuition is clearly not enough. +Unfortunately, this is how most programmers approach optimization. Most texts about performance do not teach you to reason about software performance qualitatively. Instead they give you general advice about certain implementation approaches — and general performance intuition is clearly not enough. It would have probably saved me dozens, if not hundreds of hours if I learned computer architecture *before* doing algorithmic programming. So, even if most people aren't *excited* about it, we are going to spend the first few chapters studying how CPUs work and start with learning assembly. diff --git a/content/english/hpc/architecture/assembly.md b/content/english/hpc/architecture/assembly.md index 9a90c001..de94e4cf 100644 --- a/content/english/hpc/architecture/assembly.md +++ b/content/english/hpc/architecture/assembly.md @@ -1,6 +1,7 @@ --- title: Assembly Language weight: 1 +published: true --- CPUs are controlled with *machine language*, which is just a stream of binary-encoded instructions that specify @@ -14,11 +15,11 @@ A much more human-friendly rendition of machine language, called *assembly langu Jumping right into it, here is how you add two numbers (`*c = *a + *b`) in Arm assembly: ```nasm -; *a = x0, *b = x2, *c = x2 +; *a = x0, *b = x1, *c = x2 ldr w0, [x0] ; load 4 bytes from wherever x0 points into w0 ldr w1, [x1] ; load 4 bytes from wherever x1 points into w1 add w0, w0, w1 ; add w0 with w1 and save the result to w0 -str w0, [x2] ; write contents of w0 to wherever x2 points/ +str w0, [x2] ; write contents of w0 to wherever x2 points ``` Here is the same operation in x86 assembly: @@ -32,15 +33,15 @@ mov DWORD PTR [rdx], eax ; write contents of eax to wherever rdx points Assembly is very simple in the sense that it doesn't have many syntactical constructions compared to high-level programming languages. From what you can observe from the examples above: -- A program is a sequence of instructions, each written as its name followed by a variable amount of operands. 
+- A program is a sequence of instructions, each written as its name followed by a variable number of operands. - The `[reg]` syntax is used for "dereferencing" a pointer stored in a register, and on x86 you need to prefix it with size information (`DWORD` here means 32 bit). - The `;` sign is used for line comments, similar to `#` and `//` in other languages. -Assembly a very minimal language because it needs to be. It reflects the machine language as closely as possible, up to the point where there is almost 1:1 correspondence between machine code and assembly. In fact, you can turn any compiled program back into its assembly form using a process called *disassembly*[^disassembly] — although everything non-essential like comments will not be preserved. +Assembly is a very minimal language because it needs to be. It reflects the machine language as closely as possible, up to the point where there is almost 1:1 correspondence between machine code and assembly. In fact, you can turn any compiled program back into its assembly form using a process called *disassembly*[^disassembly] — although everything non-essential like comments will not be preserved. [^disassembly]: On Linux, to disassemble a compiled program, you can call `objdump -d {path-to-binary}`. -Note that the two snippets above are not just syntactically different. Both are optimized codes produced by a compiler, but the Arm version uses 4 instruction, while the x86 version uses 3. The `add eax, [rdi]` instruction is what's called *fused instruction* that does a load and an add in one go — this is one of the perks that the [CISC](../isa#risc-vs-cisc) approach can provide. +Note that the two snippets above are not just syntactically different. Both are optimized codes produced by a compiler, but the Arm version uses 4 instructions, while the x86 version uses 3. The `add eax, [rdi]` instruction is what's called *fused instruction* that does a load and an add in one go — this is one of the perks that the [CISC](../isa#risc-vs-cisc) approach can provide. Since there are far more differences between the architectures than just this one, from here on and for the rest of the book we will only provide examples for x86, which is probably what most of our readers will optimize for, although many of the introduced concepts will be architecture-agnostic. @@ -48,23 +49,23 @@ Since there are far more differences between the architectures than just this on For historical reasons, instruction mnemonics in most assembly languages are very terse. Back when people used to write assembly by hand and repeatedly wrote the same set of common instructions, one less character to type was one step away from insanity. -For example, `mov` is for "store/load a word", `inc` is for "increment by 1", `mul` for is "multiply", and `idiv` is for "integer division". You can look up the description of an instruction by its name in [one of x86 references](https://www.felixcloutier.com/x86/), but most instructions do what you'd think they do. +For example, `mov` is for "store/load a word," `inc` is for "increment by 1," `mul` is for "multiply," and `idiv` is for "integer division." You can look up the description of an instruction by its name in [one of x86 references](https://www.felixcloutier.com/x86/), but most instructions do what you'd think they do. Most instructions write their result into the first operand, which can also be involved in the computation like in the `add eax, [rdi]` example we saw before. 
Operands can be either registers, constant values, or memory locations. -**Registers** are named `rax`, `rbx`, `rcx`, `rdx`, `rdi`, `rsi`, `rbp`, `rsp`, and `r8`-`r15` for a total of 16 of them. The "letter" ones are named like that for historical reasons: `rax` is "accumulator", `rcx` is "counter", `rdx` is "data" and so on, but, of course, they don't have to be used only for that. +**Registers** are named `rax`, `rbx`, `rcx`, `rdx`, `rdi`, `rsi`, `rbp`, `rsp`, and `r8`-`r15` for a total of 16 of them. The "letter" ones are named like that for historical reasons: `rax` is "accumulator," `rcx` is "counter," `rdx` is "data" and so on — but, of course, they don't have to be used only for that. -There are also 32-, 16-bit and 8-bit registers that have similar names (`rax` → `eax` → `ax` → `al`). They are not fully separate but *aliased*: the first 32 bits of `rax` are `eax`, the first 16 bits of `eax` are `ax`, and so on. This is made to save die space while maintaining compatibility, and it is also the reason why basic type casts in compiled programming languages are usually free. +There are also 32-, 16-bit and 8-bit registers that have similar names (`rax` → `eax` → `ax` → `al`). They are not fully separate but *aliased*: the lowest 32 bits of `rax` are `eax`, the lowest 16 bits of `eax` are `ax`, and so on. This is made to save die space while maintaining compatibility, and it is also the reason why basic type casts in compiled programming languages are usually free. -These are just the *general-purpose* registers that you can, with [some exceptions](../functions), use however you like in most instructions. There is also a separate set of registers for [floating-point arithmetic](/hpc/arithmetic/float), a bunch of very wide registers used in [vector extensions](/hpc/simd), and a few special ones that are needed for [control flow](../jumps), but we'll get there in time. +These are just the *general-purpose* registers that you can, with [some exceptions](../functions), use however you like in most instructions. There is also a separate set of registers for [floating-point arithmetic](/hpc/arithmetic/float), a bunch of very wide registers used in [vector extensions](/hpc/simd), and a few special ones that are needed for [control flow](../loops), but we'll get there in time. -**Constants** are just integer or floating point values: `42`, `0x2a`, `3.14`, `6.02e23`. They are more commonly called *immediate values* because they are embedded right into the machine code. Because it may considerably increase the complexity of the instruction encoding, some instructions don't support immediate values, or allow just a fixed subset of them. In some cases you have to load a constant value into a register and then use it instead of an immediate value. +**Constants** are just integer or floating-point values: `42`, `0x2a`, `3.14`, `6.02e23`. They are more commonly called *immediate values* because they are embedded right into the machine code. Because it may considerably increase the complexity of the instruction encoding, some instructions don't support immediate values or allow just a fixed subset of them. In some cases, you have to load a constant value into a register and then use it instead of an immediate value. Apart from numeric values, there are also string constants such as `hello` or `world\n` with their own little subset of operations, but that is a somewhat obscure corner of the assembly language that we are not going to explore here. 
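To make the three kinds of operands concrete, here is a small illustrative snippet (the registers are chosen arbitrarily):

```nasm
add eax, 42              ; add an immediate value to a register
add eax, ecx             ; add the value of another register
add eax, DWORD PTR [rsi] ; add the 32-bit value at the memory location rsi points to
```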
### Moving Data -Some instructions may have the same mnemonic, but have different operand types, in which case they are considered distinct instructions as they may perform slightly different operations and take different time to execute. The `mov` instruction is a vivid example of that, as it comes in around 20 different forms, all related to moving data: either between the memory and registers or just between two registers. Despite the name, it doesn't *move* a value into a register, but *copies* it, preserving the original. +Some instructions may have the same mnemonic, but have different operand types, in which case they are considered distinct instructions as they may perform slightly different operations and take different times to execute. The `mov` instruction is a vivid example of that, as it comes in around 20 different forms, all related to moving data: either between the memory and registers or just between two registers. Despite the name, it doesn't *move* a value into a register, but *copies* it, preserving the original. When used to copy data between two registers, the `mov` instruction instead performs *register renaming* internally — informs the CPU that the value referred by register X is actually stored in register Y — without causing any additional delay except for maybe reading and decoding the instruction itself. For the same reason, the `xchg` instruction that swaps two registers also doesn't cost anything. @@ -88,7 +89,7 @@ Memory addressing is done with the `[]` operator, but it can do more than just r SIZE PTR [base + index * scale + displacement] ``` -where `displacement` needs to be an integer constant and `scale` can be either 2, 4, or 8. What it does is calculates the pointer `base + index * scale + displacement` and dereferences it. +where `displacement` needs to be an integer constant and `scale` can be either 2, 4, or 8. What it does is calculate the pointer `base + index * scale + displacement` and dereferences it. @@ -116,20 +117,18 @@ There are actually multiple *assemblers* (the programs that produce machine code These syntaxes are also sometimes called *GAS* and *NASM* respectively, by the names of the two primary assemblers that use them (*GNU Assembler* and *Netwide Assembler*). -We used Intel syntax in this chapter and will continue to preferably use it for the rest of the book. For comparison, here is how the summation loop looks like in AT&T asm: +We used Intel syntax in this chapter and will continue to preferably use it for the rest of the book. For comparison, here is how the same `*c = *a + *b` example looks like in AT&T asm: ```asm -loop: - addl (%rax), %edx - addq $4, %rax - cmpq %rcx, %rax - jne loop +movl (%rsi), %eax +addl (%rdi), %eax +movl %eax, (%rdx) ``` The key differences can be summarized as follows: 1. The *last* operand is used to specify the destination. -2. Register names and constants need to be prefixed by `%` and `$` respectively. +2. Registers and constants need to be prefixed by `%` and `$` respectively (e.g., `addl $1, %rdx` increments `rdx`). 3. Memory addressing looks like this: `displacement(%base, %index, scale)`. 4. Both `;` and `#` can be used for line comments, and also `/* */` can be used for block comments. 
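To illustrate the operand order, the prefixes, and the addressing syntax at once, here is one and the same (purely illustrative) memory load written in both syntaxes:

```nasm
; Intel: load the 32-bit value at address (rsi + rcx * 4 + 8) into eax
mov eax, DWORD PTR [rsi + rcx * 4 + 8]
```

```asm
# AT&T: the same load
movl 8(%rsi, %rcx, 4), %eax
```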
diff --git a/content/english/hpc/architecture/functions.md b/content/english/hpc/architecture/functions.md index 908dc2bc..3f98a381 100644 --- a/content/english/hpc/architecture/functions.md +++ b/content/english/hpc/architecture/functions.md @@ -1,6 +1,7 @@ --- title: Functions and Recursion weight: 3 +published: true --- To "call a function" in assembly, you need to [jump](../loops) to its beginning and then jump back. But then two important problems arise: @@ -15,9 +16,9 @@ Both of these concerns can be solved by having a dedicated location in memory wh The hardware stack works the same way software stacks do and is similarly implemented as just two pointers: - The *base pointer* marks the start of the stack and is conventionally stored in `rbp`. -- The *stack pointer* marks the last element on the stack and is conventionally stored in `rsp`. +- The *stack pointer* marks the last element of the stack and is conventionally stored in `rsp`. -When you need to call a function, you push all your local variables onto the stack (which you can also do in other circumstances, e. g. when you run out of registers), push the current instruction pointer, and then jump to the beginning of the function. When exiting from a function, you look at the pointer stored on top of the stack, jump there, and then carefully read all the variables stored on the stack back into their registers. +When you need to call a function, you push all your local variables onto the stack (which you can also do in other circumstances; e.g., when you run out of registers), push the current instruction pointer, and then jump to the beginning of the function. When exiting from a function, you look at the pointer stored on top of the stack, jump there, and then carefully read all the variables stored on the stack back into their registers. -By convention, a function should take its arguments in `rdi`, `rsi`, `rdx`, `rcx`, `r8`, `r9` (and the rest in the stack if that wasn't enough), put the return value into `rax`, and then return. Thus, `square`, being a simple one-argument function, can be implemented like this: +By convention, a function should take its arguments in `rdi`, `rsi`, `rdx`, `rcx`, `r8`, `r9` (and the rest in the stack if those weren't enough), put the return value into `rax`, and then return. Thus, `square`, being a simple one-argument function, can be implemented like this: ```nasm square: ; x = edi, ret = eax @@ -189,7 +190,7 @@ distance: ret ``` -This is better, but we are still implicitly accessing stack memory: you need to push and pop the instruction pointer on each function call. In simple cases like this, we can *inline* function calls by stitching callee's code into the caller and resolving conflicts over registers. In our example: +This is better, but we are still implicitly accessing stack memory: you need to push and pop the instruction pointer on each function call. In simple cases like this, we can *inline* function calls by stitching the callee's code into the caller and resolving conflicts over registers. 
In our example: ```nasm distance: @@ -229,7 +230,7 @@ Equivalent assembly: ```nasm ; n = edi, ret = eax factorial: - test edi, edi ; test if a value if zero + test edi, edi ; test if a value is zero jne nonzero ; (the machine code of "cmp rax, 0" would be one byte longer) mov eax, 1 ; return 1 ret @@ -249,7 +250,7 @@ To make our `factorial` function tail-recursive, we can pass a "current product" ```cpp int factorial(int n, int p = 1) { if (n == 0) - return 1; + return p; return factorial(n - 1, p * n); } ``` diff --git a/content/english/hpc/architecture/indirect.md b/content/english/hpc/architecture/indirect.md index ce6e86b8..1bd96c06 100644 --- a/content/english/hpc/architecture/indirect.md +++ b/content/english/hpc/architecture/indirect.md @@ -102,11 +102,11 @@ There are many ways to implement this behavior, but C++ does it using a *virtual For all concrete implementations of `Animal`, compiler pads all their methods (that is, their instruction sequences) so that they have the exact same length for all classes (by inserting some [filler instructions](../layout) after `ret`) and then just writes them sequentially somewhere in the instruction memory. Then it adds a *run-time type information* field to the structure (that is, to all its instances), which is essentially just the offset in the memory region that points to the right implementation of the virtual methods of the class. -During a virtual method call, that offset field is fetched from the instance of a structure, and a normal function call is made with it, using the fact that all methods and other fields of every derived class have exactly the same offsets. +With a virtual method call, that offset field is fetched from the instance of a structure and a normal function call is made with it, using the fact that all methods and other fields of every derived class have exactly the same offsets. Of course, this adds some overhead: -- You may need to spend another 15 cycles or so for the same pipeline flushing reasons as for [branch misprediction](../pipelining). +- You may need to spend another 15 cycles or so for the same pipeline flushing reasons as for [branch misprediction](/hpc/pipelining). - The compiler most likely won't be able to inline the function call itself. - Class size increases by a couple of bytes or so (this is implementation-specific). - The binary size itself increases a little bit. diff --git a/content/english/hpc/architecture/isa.md b/content/english/hpc/architecture/isa.md index d109b359..b902f69c 100644 --- a/content/english/hpc/architecture/isa.md +++ b/content/english/hpc/architecture/isa.md @@ -14,7 +14,7 @@ Abstractions help us in reducing all this complexity down to a single *interface Hardware engineers love abstractions too. An abstraction of a CPU is called an *instruction set architecture* (ISA), and it defines how a computer should work from a programmer's perspective. Similar to software interfaces, it gives computer engineers the ability to improve on existing CPU designs while also giving its users — us, programmers — the confidence that things that worked before won't break on newer chips. -An ISA essentially defines how the hardware should interpret the machine language. Apart from instructions and their binary encodings, ISA importantly defines counts, sizes, and purposes of registers, the memory model, and the input/output model. 
Similar to software interfaces, ISAs can be extended too: in fact, they are often updated, mostly in a backward-compatible way, to add new and more specialized instructions that can improve performance. +An ISA essentially defines how the hardware should interpret the machine language. Apart from instructions and their binary encodings, an ISA also defines the counts, sizes, and purposes of registers, the memory model, and the input/output model. Similar to software interfaces, ISAs can be extended too: in fact, they are often updated, mostly in a backward-compatible way, to add new and more specialized instructions that can improve performance. ### RISC vs CISC @@ -23,7 +23,7 @@ Historically, there have been many competing ISAs in use. But unlike [character - **Arm** chips, which are used in almost all mobile devices, as well as other computer-like devices such as TVs, smart fridges, microwaves, [car autopilots](https://en.wikipedia.org/wiki/Tesla_Autopilot), and so on. They are designed by a British company of the same name, as well as a number of electronics manufacturers including Apple and Samsung. - **x86**[^x86] chips, which are used in almost all servers and desktops, with a few notable exceptions such as Apple's M1 MacBooks, AWS's Graviton processors, and the current [world's fastest supercomputer](https://en.wikipedia.org/wiki/Fugaku_(supercomputer)), all of which use Arm-based CPUs. They are designed by a duopoly of Intel and AMD. -[^x86]: Modern 64-bit versions of x86 are known as "AMD64", "Intel 64", or by the more vendor-neutral names of "x86-64" or just "x64". A similar 64-bit extension of Arm is called "AArch64" or "ARM64". In this book, we will just use plain "x86" and "Arm" implying the 64-bit versions. +[^x86]: Modern 64-bit versions of x86 are known as "AMD64," "Intel 64," or by the more vendor-neutral names of "x86-64" or just "x64." A similar 64-bit extension of Arm is called "AArch64" or "ARM64." In this book, we will just use plain "x86" and "Arm" implying the 64-bit versions. The main difference between them is that of architectural complexity, which is more of a design philosophy rather than some strictly defined property: diff --git a/content/english/hpc/architecture/layout.md b/content/english/hpc/architecture/layout.md index 1ab39c82..df414512 100644 --- a/content/english/hpc/architecture/layout.md +++ b/content/english/hpc/architecture/layout.md @@ -1,6 +1,7 @@ --- title: Machine Code Layout weight: 10 +published: true --- Computer engineers like to mentally split the [pipeline of a CPU](/hpc/pipelining) into two parts: the *front-end*, where instructions are fetched from memory and decoded, and the *back-end*, where they are scheduled and finally executed. Typically, the performance is bottlenecked by the execution stage, and for this reason, most of our efforts in this book are going to be spent towards optimizing around the back-end. @@ -15,7 +16,7 @@ During the **fetch** stage, the CPU simply loads a fixed-size chunk of bytes fro -Next comes the **decode** stage: the CPU looks at this chunk of bytes, discards everything that comes before the instruction pointer, and splits the rest of them into instructions. Machine instructions are encoded using a variable amount of bytes: something simple and very common like `inc rax` takes one byte, while some obscure instruction with encoded constants and behavior-modifying prefixes may take up to 15. 
So, from a 32-byte block, a variable number of instructions may be decoded, but no more than a certain machine-dependant limit called the *decode width*. On my CPU (a [Zen 2](https://en.wikichip.org/wiki/amd/microarchitectures/zen_2)), the decode width is 4, which means that on each cycle, up to 4 instructions can be decoded and passed to the next stage. +Next comes the **decode** stage: the CPU looks at this chunk of bytes, discards everything that comes before the instruction pointer, and splits the rest of them into instructions. Machine instructions are encoded using a variable number of bytes: something simple and very common like `inc rax` takes one byte, while some obscure instruction with encoded constants and behavior-modifying prefixes may take up to 15. So, from a 32-byte block, a variable number of instructions may be decoded, but no more than a certain machine-dependent limit called the *decode width*. On my CPU (a [Zen 2](https://en.wikichip.org/wiki/amd/microarchitectures/zen_2)), the decode width is 4, which means that on each cycle, up to 4 instructions can be decoded and passed to the next stage. The stages work in a pipelined fashion: if the CPU can tell (or [predict](/hpc/pipelining/branching/)) which instruction block it needs next, then the fetch stage doesn't wait for the last instruction in the current block to be decoded and loads the next one right away. @@ -29,7 +30,7 @@ Loop Stream Detector (LSD) ### Code Alignment -Other things being equal, compilers typically prefer instructions with shorter machine code, because this way more instructions can fit in a single 32B fetch block, and also because it reduces the size of the binary. But sometimes the reverse advice applies, caused by the fact that the fetched instructions blocks have to be aligned. +Other things being equal, compilers typically prefer instructions with shorter machine code, because this way more instructions can fit in a single 32B fetch block, and also because it reduces the size of the binary. But sometimes the reverse is prefereable, due to the fact that the fetched instructions' blocks must be aligned. Imagine that you need to execute an instruction sequence that starts on the last byte of a 32B-aligned block. You may be able to execute the first instruction without additional delay, but for the subsequent ones, you have to wait for one additional cycle to do another instruction fetch. If the code block was aligned on a 32B boundary, then up to 4 instructions could be decoded and then executed concurrently (unless they are extra long or interdependent). @@ -45,15 +46,15 @@ In GCC, you can use `-falign-labels=n` flag to specify a particular alignment po The instructions are stored and fetched using largely the same [memory system](/hpc/cpu-cache) as for the data, except maybe the lower layers of cache are replaced with a separate *instruction cache* (because you wouldn't want a random data read to kick out the code that processes it). -The instruction cache is crucial in situations when you either +The instruction cache is crucial in situations when you either: - don't know what instructions you are going to execute next, and need to fetch the next block with [low latency](/hpc/cpu-cache/latency), -- or executing a long sequence of verbose-but-quick-to-process instructions, and need [high bandwidth](/hpc/cpu-cache/bandwidth). +- or are executing a long sequence of verbose-but-quick-to-process instructions, and need [high bandwidth](/hpc/cpu-cache/bandwidth). 
The memory system can therefore become the bottleneck for programs with large machine code. This consideration limits the applicability of the optimization techniques we've previously discussed: - [Inlining functions](../functions) is not always optimal, because it reduces code sharing and increases the binary size, requiring more instruction cache. -- [Unrolling loops](../loops) is only beneficial up to some extent, even if the number of loops is known during compile-time: at some point, the CPU would have to fetch both instructions and data from the main memory, in which case it will likely be bottlenecked by the memory bandwidth. +- [Unrolling loops](../loops) is only beneficial up to some extent, even if the number of iterations is known during compile time: at some point, the CPU would have to fetch both instructions and data from the main memory, in which case it will likely be bottlenecked by the memory bandwidth. - Huge [code alignments](#code-alignment) increase the binary size, again requiring more instruction cache. Spending one more cycle on fetch is a minor penalty compared to missing the cache and waiting for the instructions to be fetched from the main memory. Another aspect is that placing frequently used instruction sequences on the same [cache lines](/hpc/cpu-cache/cache-lines) and [memory pages](/hpc/cpu-cache/paging) improves [cache locality](/hpc/external-memory/locality). To improve instruction cache utilization, you should group hot code with hot code and cold code with cold code, and remove dead (unused) code if possible. If you want to explore this idea further, check out Facebook's [Binary Optimization and Layout Tool](https://engineering.fb.com/2018/06/19/data-infrastructure/accelerate-large-scale-applications-with-bolt/), which was recently [merged](https://github.com/llvm/llvm-project/commit/4c106cfdf7cf7eec861ad3983a3dd9a9e8f3a8ae) into LLVM. @@ -126,7 +127,7 @@ normal: ret swap: xchg edi, esi - jump normal + jmp normal ``` This technique is quite handy when handling exceptions cases in general, and in high-level code, you can give the compiler a [hint](/hpc/compilation/situational) that a certain branch is more likely than the other: @@ -152,7 +153,7 @@ length: ret ``` -This is a very important issue, and we will spend [much of the next chapter](/hpc/pipelining/branching) discussing it in more detail. +Eliminating branches is an important topic, and we will spend [much of the next chapter](/hpc/pipelining/branching) discussing it in more detail. diff --git a/content/english/hpc/arithmetic/bit-hacks.md b/content/english/hpc/arithmetic/bit-hacks.md index 5d54b1c1..44a365eb 100644 --- a/content/english/hpc/arithmetic/bit-hacks.md +++ b/content/english/hpc/arithmetic/bit-hacks.md @@ -24,11 +24,11 @@ Left or right-shifting negative numbers invokes undefined behavior in C/C++. `__builtin_popcount` `popcnt` Returns the number of 1-bits in x. -`__builtin_parity` Returns the parity of x, i.e. the number of 1-bits in x modulo 2. +`__builtin_parity` Returns the *parity* of x (that is, the number of 1-bits in x modulo 2). This is presumably for [error detection](https://en.wikipedia.org/wiki/Parity_bit). -`__builtin_clrsb` Returns the number of leading redundant sign bits in x, i.e. the number of bits following the most significant bit that are identical to it. There are no special cases for 0 or other values. +`__builtin_clrsb` Returns the number of leading redundant sign bits in x, i.e., the number of bits following the most significant bit that are identical to it. 
There are no special cases for 0 or other values.
`__builtin_ffs` Returns one plus the index of the least significant 1-bit of x, or if x is zero, returns zero.
diff --git a/content/english/hpc/arithmetic/division.md b/content/english/hpc/arithmetic/division.md index 638446b2..0bf44da8 100644 --- a/content/english/hpc/arithmetic/division.md +++ b/content/english/hpc/arithmetic/division.md @@ -3,9 +3,62 @@ title: Integer Division weight: 6 ---
-As we know from [the case study of GCD](/hpc/analyzing-performance/gcd/), integer division is painfully slow, even when fully implemented in hardware. Usually we want to avoid doing it in the first place, but when we can't, there are several clever tricks that replace it with multiplication at the cost of a bit of precomputation.
+Compared to other arithmetic operations, division works very poorly on x86 and computers in general. Both floating-point and integer division are notoriously hard to implement in hardware. The circuitry takes a lot of space in the ALU, the computation has a lot of stages, and as a result, `div` and its siblings routinely take 10-20 cycles to complete, with latency being slightly less on smaller data type sizes.
-All these tricks are based on the following idea. Consider the task of dividing one floating-point number $x$ by another floating-point number $y$, when $y$ is known in advance. What we can do is to calculate a constant
+### Division and Modulo in x86
+
+Since nobody wants to duplicate all this mess for a separate modulo operation, the `div` instruction serves both purposes. To perform a 32-bit integer division, you need to put the dividend *specifically* in the `eax` register and call `div` with the divisor as its sole operand. After this, the quotient will be stored in `eax` and the remainder will be stored in `edx`.
+
+The only caveat is that the dividend actually needs to be stored in *two* registers, `eax` and `edx`: this mechanism enables 64-by-32 or even 128-by-64 division, similar to how [128-bit multiplication](../integer) works. When performing the usual 32-by-32 signed division, we need to sign-extend `eax` to 64 bits and store its higher part in `edx`:
+
+```nasm
+div(int, int):
+    mov eax, edi
+    cdq
+    idiv esi
+    ret
+```
+
+For unsigned division, you can just set `edx` to zero so that it doesn't interfere:
+
+```nasm
+div(unsigned, unsigned):
+    mov eax, edi
+    xor edx, edx
+    div esi
+    ret
+```
+
+And in both cases, in addition to the quotient in `eax`, you can also access the remainder in `edx`:
+
+```nasm
+mod(unsigned, unsigned):
+    mov eax, edi
+    xor edx, edx
+    div esi
+    mov eax, edx
+    ret
+```
+
+You can also divide a 128-bit integer (stored in `rdx:rax`) by a 64-bit integer:
+
+```nasm
+div(u128, u64):
+    ; a = rsi:rdi (128-bit), b = rdx
+    mov rcx, rdx
+    mov rax, rdi
+    mov rdx, rsi
+    div rcx
+    ret
+```
+
+The high part of the dividend should be less than the divisor, otherwise an overflow occurs. Because of this constraint, it is [hard](https://danlark.org/2020/06/14/128-bit-division/) to get compilers to produce this code by themselves: if you divide a [128-bit integer type](../integer) by a 64-bit integer, the compiler will bubble-wrap it with additional checks which may actually be unnecessary.
+
+### Division by Constants
+
+Integer division is painfully slow, even when fully implemented in hardware, but it can be avoided in certain cases if the divisor is constant.
A well-known example is the division by a power of two, which can be replaced by a one-cycle binary shift: the [binary GCD algorithm](/hpc/algorithms/gcd) is a delightful showcase of this technique. + +In the general case, there are several clever tricks that replace division with multiplication at the cost of a bit of precomputation. All these tricks are based on the following idea. Consider the task of dividing one floating-point number $x$ by another floating-point number $y$, when $y$ is known in advance. What we can do is to calculate a constant $$ d \approx y^{-1} @@ -52,28 +105,28 @@ mov rax, rdx shr rax, 29 ; binary shift of the result ``` -This trick is called *Barrett reduction*, and it's called "reduction" because it is mostly used for modulo operations, which can be replaced with a single division, multiplication and subtraction by the virtue of this formula: +This technique is called *Barrett reduction*, and it's called "reduction" because it is mostly used for modulo operations, which can be replaced with a single division, multiplication and subtraction by the virtue of this formula: $$ r = x - \lfloor x / y \rfloor \cdot y $$ -This method requires some precomputation, including performing one actual division, so this is only beneficial when you do not one, but a few of them, with a constant divisor. +This method requires some precomputation, including performing one actual division. Therefore, this is only beneficial when you perform not just one but a few divisions, all with the same constant divisor. ### Why It Works It is not very clear why such $m$ and $s$ always exist, let alone how to find them. But given a fixed $s$, intuition tells us that $m$ should be as close to $2^s/y$ as possible for $2^s$ to cancel out. So there are two natural choices: $\lfloor 2^s/y \rfloor$ and $\lceil 2^s/y \rceil$. The first one doesn't work, because if you substitute $$ -\lfloor \frac{x \cdot \lfloor 2^s/y \rfloor}{2^s} \rfloor +\Bigl \lfloor \frac{x \cdot \lfloor 2^s/y \rfloor}{2^s} \Bigr \rfloor $$ -then for any integer $\frac{x}{y}$ where $y$ is not even, the result will be stricly less than the truth. This only leaves the other case, $m = \lceil 2^s/y \rceil$. Now, let's try to derive the lower and upper bounds for the result of the computation: +then for any integer $\frac{x}{y}$ where $y$ is not even, the result will be strictly less than the truth. This only leaves the other case, $m = \lceil 2^s/y \rceil$. Now, let's try to derive the lower and upper bounds for the result of the computation: $$ \lfloor x / y \rfloor -= \lfloor \frac{x \cdot m}{2^s} \rfloor -= \lfloor \frac{x \cdot \lceil 2^s /y \rceil}{2^s} \rfloor += \Bigl \lfloor \frac{x \cdot m}{2^s} \Bigr \rfloor += \Bigl \lfloor \frac{x \cdot \lceil 2^s /y \rceil}{2^s} \Bigr \rfloor $$ Let's start with the bounds for $m$: @@ -91,14 +144,14 @@ And now for the whole expression: $$ x / y - 1 < -\lfloor \frac{x \cdot \lceil 2^s /y \rceil}{2^s} \rfloor +\Bigl \lfloor \frac{x \cdot \lceil 2^s /y \rceil}{2^s} \Bigr \rfloor < x / y + x / 2^s $$ We can see that the result falls somewhere in a range of size $(1 + \frac{x}{2^s})$, and if this range always has exactly one integer for all possible $x / y$, then the algorithm is guaranteed to give the right answer. Turns out, we can always set $s$ to be high enough to achieve it. -What will be the worst case here? How to pick $x$ and $y$ so that $(x/y - 1, x/y + x / 2^s)$ contains two integers? 
We can see that integer ratios don't work, because the left border is not included, and assuming $x/2^s < 1$, only $x/y$ itself will be in the range. The worst case is in actually the $x/y$ that comes closest to $1$ without exceeding it. For $n$-bit integers, that is the second largest possible integer divided by the first largest:
+What will be the worst case here? How to pick $x$ and $y$ so that the $(x/y - 1, x/y + x / 2^s)$ range contains two integers? We can see that integer ratios don't work because the left border is not included, and assuming $x/2^s < 1$, only $x/y$ itself will be in the range. The worst case is actually the $x/y$ that comes closest to $1$ without exceeding it. For $n$-bit integers, that is the second-largest possible integer divided by the first-largest:
$$ \begin{aligned} @@ -111,7 +164,7 @@ In this case, the lower bound will be $(\frac{2^n-2}{2^n-1} - 1)$ and the upper
### Lemire Reduction
-Barrett reduction is a bit complicated, and also generates a length instruction sequence for modulo because it is computed indirectly. There a new ([2019](https://arxiv.org/pdf/1902.01961.pdf)) method, which is simpler and actually faster for modulo in some cases. It doesn't have a conventional name yet, but I am going to refer to it as [Lemire](https://lemire.me/blog/) reduction.
+Barrett reduction is a bit complicated, and also generates a lengthy instruction sequence for modulo because it is computed indirectly. There is a new ([2019](https://arxiv.org/pdf/1902.01961.pdf)) method, which is simpler and actually faster for modulo in some cases. It doesn't have a conventional name yet, but I am going to refer to it as [Lemire](https://lemire.me/blog/) reduction.
Here is the main idea. Consider the floating-point representation of some integer fraction:
@@ -129,24 +182,24 @@ Now, for 32-bit integers, we can set $s = 64$ and look at the computation that w
$$
\lfloor x / y \rfloor
-= \lfloor \frac{x \cdot m}{2^s} \rfloor
-= \lfloor \frac{x \cdot \lceil 2^s /y \rceil}{2^s} \rfloor
+= \Bigl \lfloor \frac{x \cdot m}{2^s} \Bigr \rfloor
+= \Bigl \lfloor \frac{x \cdot \lceil 2^s /y \rceil}{2^s} \Bigr \rfloor
$$
What we really do here is we multiply $x$ by a floating-point constant ($x \cdot m$) and then truncate the result $(\lfloor \frac{\cdot}{2^s} \rfloor)$.
-What if we took not the highest bits, but the lowest? This would correspond to the fractional part — and if we multiply it back by $y$ and truncate the result, this will be exactly the remainder:
+What if we took not the highest bits but the lowest? This would correspond to the fractional part — and if we multiply it back by $y$ and truncate the result, this will be exactly the remainder:
$$
r = \Bigl \lfloor \frac{ (x \cdot \lceil 2^s /y \rceil \bmod 2^s) \cdot y }{2^s} \Bigr \rfloor
$$
-This works perfectly, because what we do here can be interpreted as just three chained floating-point multiplications with the total relative error of $O(\epsilon)$. Since $\epsilon = O(\frac{1}{2^s})$ and $s = 2n$, the error will always be less than one, and hence the result will be exact.
+This works perfectly because what we do here can be interpreted as just three chained floating-point multiplications with the total relative error of $O(\epsilon)$. Since $\epsilon = O(\frac{1}{2^s})$ and $s = 2n$, the error will always be less than one, and hence the result will be exact.
```c++ uint32_t y; -uint64_t m = uint64_t(-1) / y + 1; // ceil(2^64 / d) +uint64_t m = uint64_t(-1) / y + 1; // ceil(2^64 / y) uint32_t mod(uint32_t x) { uint64_t lowbits = m * x; @@ -158,6 +211,20 @@ uint32_t div(uint32_t x) { } ``` +We can also check divisibility of $x$ by $y$ with just one multiplication using the fact that the remainder of division is zero if and only if the fractional part (the lower 64 bits of $m \cdot x$) does not exceed $m$ (otherwise, it would become a nonzero number when multiplied back by $y$ and right-shifted by 64). + +```c++ +bool is_divisible(uint32_t x) { + return m * x < m; +} +``` + The only downside of this method is that it needs integer types four times the original size to perform the multiplication, while other reduction methods can work with just the double. -There is a a way though to compute 64x64 modulo by carefully manipulating the halves of intermediate results; the implementation is left as an exercise to the reader. +There is also a way to compute 64x64 modulo by carefully manipulating the halves of intermediate results; the implementation is left as an exercise to the reader. + +### Further Reading + +Check out [libdivide](https://github.com/ridiculousfish/libdivide) and [GMP](https://gmplib.org/) for more general implementations of optimized integer division. + +It is also worth reading [Hacker's Delight](https://www.amazon.com/Hackers-Delight-2nd-Henry-Warren/dp/0321842685), which has a whole chapter dedicated to integer division. diff --git a/content/english/hpc/arithmetic/errors.md b/content/english/hpc/arithmetic/errors.md index d40276ab..47d5d42d 100644 --- a/content/english/hpc/arithmetic/errors.md +++ b/content/english/hpc/arithmetic/errors.md @@ -1,33 +1,63 @@ --- title: Rounding Errors weight: 2 +published: true --- -The way rounding works in hardware floats is remarkably simple: it occurs if and only if the result of the operation is not representable exactly, and by default gets rounded to the nearest representable number (and to the nearest zero-ending number in case of a tie). +The way rounding works in hardware floats is remarkably simple: it occurs if and only if the result of the operation is not representable exactly, and by default gets rounded to the nearest representable number (in case of a tie preferring the number that ends with a zero). + +Consider the following code snippet: + +```c++ +float x = 0; +for (int i = 0; i < (1 << 25); i++) + x++; +printf("%f\n", x); +``` + +Instead of printing $2^{25} = 33554432$ (what the result mathematically should be), it outputs $16777216 = 2^{24}$. Why? + +When we repeatedly increment a floating-point number $x$, we eventually hit a point where it becomes so big that $(x + 1)$ gets rounded back to $x$. The first such number is $2^{24}$ (the number of mantissa bits plus one) because + +$$2^{24} + 1 = 2^{24} \cdot 1.\underbrace{0\ldots0}_{\times 23} 1$$ + +has the exact same distance from $2^{24}$ and $(2^{24} + 1)$ but gets rounded down to $2^{24}$ by the above-stated tie-breaker rule. At the same time, the increment of everything lower than that can be represented exactly, so no rounding happens in the first place. + +### Rounding Errors and Operation Order + +The result of a floating-point computation may depend on the order of operations despite being algebraically correct. 
+ +For example, while the operations of addition and multiplication are commutative and associative in the pure mathematical sense, their rounding errors are not: when we have three floating-point variables $x$, $y$, and $z$, the result of $(x+y+z)$ depends on the order of summation. The same non-commutativity principle applies to most if not all other floating-point operations. + +Compilers are not allowed to produce [non-spec-compliant](/hpc/compilation/contracts/) results, so this annoying nuance disables some potential optimizations that involve rearranging operands in arithmetic. You can disable this strict compliance with the `-ffast-math` flag in GCC and Clang. If we add it and re-compile the code snippet above, it runs [considerably faster](/hpc/simd/reduction) and also happens to output the correct result, 33554432 (although you need to be aware that the compiler also could have chosen a less precise computation path). + +### Rounding Modes Apart from the default mode (also known as Banker's rounding), you can [set](https://www.cplusplus.com/reference/cfenv/fesetround/) other rounding logic with 4 more modes: -* round to nearest, with ties always rounding "away" from zero; -* round up (toward $+∞$; negative results thus round toward zero); -* round down (toward $-∞$; negative results thus round away from zero); -* round toward zero (truncation of the binary result). +- round to nearest, with perfect ties always rounding "away" from zero; +- round up (toward $+∞$; negative results thus round toward zero); +- round down (toward $-∞$; negative results thus round away from zero); +- round toward zero (a truncation of the binary result). + +For example, if you call `fesetround(FE_UPWARD)` before running the loop above, it outputs not $2^{24}$, and not even $2^{25}$, but $67108864 = 2^{26}$. This happens because when we get to $2^{24}$, $(x + 1)$ starts rounding to the next nearest representable number $(x + 2)$, and we reach $2^{25}$ in half the time, and after that, $(x + 1)$ rounds up to $(x+4)$, and we start going four times as fast. -The alternative rounding modes are also useful in diagnosing numerical instability. If the results of a subroutine vary substantially between rounding to the positive and negative infinities, then it indicates susceptibility to round-off errors. Is a better test than switching all computations to a lower precision and checking whether the result changed by too much, because the default rounding to nearest results in the right "expected" value given enough averaging: statistically, half of the time they are rounding up and the other are rounding down, so they cancel each other. +One of the uses for the alternative rounding modes is for diagnosing numerical instability. If the results of an algorithm substantially vary when switching between rounding to the positive and negative infinities, it indicates susceptibility to round-off errors. -Note that while most operations with real numbers are commutative and associative, their rounding errors are not: even the result of $(x+y+z)$ depends on the order of summation. Compilers are not allowed to produce non-spec-compliant results, so this disables some potential optimizations that involve rearranging operands. You can disable this strict compliance with the `-ffast-math` flag in GCC and Clang, although you need to be aware that this lets compilers sometimes choose less precise computation paths. 
+This test is often better than switching all computations to lower precision and checking whether the result changed by too much, because the default round-to-nearest policy converges to the correct “expected” value given enough averaging: half of the time the errors are rounding up, and the other half they are rounding down — so, statistically, they cancel each other.
-It seems surprising to expect this guarantee from hardware that performs complex calculations such as natural logarithms and square roots, but this is it: you guaranteed to get the highest precision possible from all operations. This makes it remarkably easy to analyze round-off errors, as we will see in a bit.
+### Measuring Errors
-## Measuring and Mitigating Errors
+It seems surprising to expect this guarantee from hardware that performs complex calculations such as natural logarithms and square roots, but this is it: you are guaranteed to get the highest precision possible from all operations. This makes it remarkably easy to analyze round-off errors, as we will see in a bit.
There are two natural ways to measure computational errors:
-* The engineers who create hardware or spec-compliant exact software are concerned with *units in the last place* (ulps), which is the distance between two numbers in terms of how many representable numbers can fit between the precise real value and the actual result of computation.
+* The engineers who create hardware or spec-compliant exact software are concerned with *units in the last place* (ulps), which is the distance between two numbers in terms of how many representable numbers can fit between the precise real value and the actual result of the computation.
* People that are working on numerical algorithms care about *relative precision*, which is the absolute value of the approximation error divided by the real answer: $|\frac{v-v'}{v}|$.
In either case, the usual tactic to analyze errors is to assume the worst case and simply bound them.
-If you perform a single basic arithmetic operation, then the worst thing that can happen is the result rounding to the nearest representable number, meaning that the error in this case does not exceed 0.5 ulps. To reason about relative errors the same way, we can define a number $\epsilon$ called *machine epsilon*, equal to the difference between $1$ and the next representable value (which should be equal to 2 to the negative power of however many bits are dedicated to mantissa).
+If you perform a single basic arithmetic operation, then the worst thing that can happen is the result rounding to the nearest representable number, meaning that the error does not exceed 0.5 ulps. To reason about relative errors the same way, we can define a number $\epsilon$ called *machine epsilon*, equal to the difference between $1$ and the next representable value (which should be equal to 2 to the negative power of however many bits are dedicated to mantissa).
This means that if after a single arithmetic operation you get result $x$, then the real value is somewhere in the range
@@ -44,11 +74,11 @@ bool eq(float a, float b) { } ```
-The value of epsilon should depend on the application: the one above — the machine epsilon for `float` — is only good for no more than one floating-point operation.
+The value of `eps` should depend on the application: the one above — the machine epsilon for `float` — is only good for no more than one floating-point operation.
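+
+As a quick sanity check of the definition, here is a minimal sketch (assuming the usual x86-64 SSE float semantics) that derives the machine epsilon of `float` by repeated halving and compares it against the standard `FLT_EPSILON` constant from `<cfloat>`:
+
+```c++
+#include <cfloat>
+#include <cstdio>
+
+int main() {
+    float eps = 1;
+    while (1 + eps / 2 != 1) // 1 + 2^-24 is a tie and already rounds back to 1
+        eps /= 2;
+    printf("%g %g\n", eps, FLT_EPSILON); // both print 1.19209e-07, that is, 2^-23
+}
+```
+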
### Interval Arithmetic -An algorithm is called *numerically stable* if its error, whatever its cause, does not grow to be much larger during the calculation. This happens if the problem is *well-conditioned*, meaning that the solution changes by only a small amount if the problem data are changed by a small amount. +An algorithm is called *numerically stable* if its error, whatever its cause, does not grow much larger during the calculation. This can only happen if the problem itself is *well-conditioned*, meaning that the solution changes only by a small amount if the input data are changed by a small amount. When analyzing numerical algorithms, it is often useful to adopt the same method that is used in experimental physics: instead of working with unknown real values, we will work with the intervals where they may be in. @@ -60,7 +90,7 @@ for (int i = 0; i < n; i++) x *= a[i]; ``` -After the first multiplication, the value of $x$ relative to the value of the real product is bounded by $(1 + \epsilon)$, and after each additional multiplication this upper bound is multiplied by another $(1 + \epsilon)$. By induction, after $n$ multiplications, the computed value is bound by $(1 + \epsilon)^n = 1 + n \epsilon + O(\epsilon^2)$ and a similar lower bound. +After the first multiplication, the value of $x$ relative to the value of the real product is bounded by $(1 + \epsilon)$, and after each additional multiplication, this upper bound is multiplied by another $(1 + \epsilon)$. By induction, after $n$ multiplications, the computed value is bound by $(1 + \epsilon)^n = 1 + n \epsilon + O(\epsilon^2)$ and a similar lower bound. This implies that the relative error is $O(n \epsilon)$, which is sort of okay, because usually $n \ll \frac{1}{\epsilon}$. @@ -90,17 +120,17 @@ $$ If $x$ and $y$ are close in magnitude, the error will be $O(\epsilon \cdot |x|)$. -Under direct computation, the subtraction "magnifies" the errors of the squaring. But this can be fixed by instead using the following formula: +Under direct computation, the subtraction "magnifies" the errors of squaring. But this can be fixed by instead using the following formula: $$ f(x, y) = x^2 - y^2 = (x + y) \cdot (x - y) $$ -In this one, it is easy to show that the error is be bound by $\epsilon \cdot |x - y|$. It is also faster because it needs 2 additions and 1 multiplication: one fast addition more and one slow multiplication less compared to the original. +In this one, it is easy to show that the error is bound by $\epsilon \cdot |x - y|$. It is also faster because it needs 2 additions and 1 multiplication: one fast addition more and one slow multiplication less compared to the original. ### Kahan Summation -From previous example, we can see that long chains of operations are not a problem, but adding and subtracting numbers of different magnitude is. The general approach to dealing with such problems is to try to keep big numbers with big numbers and low numbers with low numbers. +From the previous example, we can see that long chains of operations are not a problem, but adding and subtracting numbers of different magnitude is. The general approach to dealing with such problems is to try to keep big numbers with big numbers and small numbers with small numbers. Consider the standard summation algorithm: @@ -112,7 +142,7 @@ for (int i = 0; i < n; i++) Since we are performing summations and not multiplications, its relative error is no longer just bounded by $O(\epsilon \cdot n)$, but heavily depends on the input. 
-In the most ridiculous case, if the first value is $2^{23}$ and the others are ones, the sum is going to be $2^{23}$ regardless of $n$, which can be verified by executing the following code and observing that it simply prints $16777216 = 2^{23}$ twice: +In the most ridiculous case, if the first value is $2^{24}$ and the other values are equal to $1$, the sum is going to be $2^{24}$ regardless of $n$, which can be verified by executing the following code and observing that it simply prints $16777216 = 2^{24}$ twice: ```cpp const int n = (1<<24); @@ -125,7 +155,7 @@ for (int i = 0; i < n; i++) printf("%f\n", s); ``` -This happens because `float` has only 23 mantissa bits, and so $2^{23} + 1$ is the first integer number that can't be represented exactly and has to be rounded down, which happens every time we try to add $1$ to $s = 2^{23}$. The error is indeed $O(n \cdot \epsilon)$ but in terms of the absolute error, not the relative one: in the example above, it is $2$, and it would go up to infinity if the last number happened to be $-2^{23}$. +This happens because `float` has only 23 mantissa bits, and so $2^{24} + 1$ is the first integer number that can't be represented exactly and has to be rounded down, which happens every time we try to add $1$ to $s = 2^{24}$. The error is indeed $O(n \cdot \epsilon)$ but in terms of the absolute error, not the relative one: in the example above, it is $2$, and it would go up to infinity if the last number happened to be $-2^{24}$. The obvious solution is to switch to a larger type such as `double`, but this isn't really a scalable method. An elegant solution is to store the parts that weren't added in a separate variable, which is then added to the next variable: @@ -141,7 +171,7 @@ for (int i = 0; i < n; i++) { This trick is known as *Kahan summation*. Its relative error is bounded by $2 \epsilon + O(n \epsilon^2)$: the first term comes from the very last summation, and the second term is due to the fact that we work with less-than-epsilon errors on each step. -Of course, a more general approach would be to switch to a more precise data type, like `double`, either way effectively squaring the machine epsilon. It can sort of be scaled by bundling two `double` variable together ne for storing the value, and another for its non-representable errors, so that they actually represent $a+b$. This approach is known as *double-double* arithmetic, and can be similarly generalized to define quad-double and higher precision arithmetic. +Of course, a more general approach that works not just for array summation would be to switch to a more precise data type, like `double`, also effectively squaring the machine epsilon. Furthermore, it can (sort of) be scaled by bundling two `double` variables together: one for storing the value and another for its non-representable errors so that they represent the value $a+b$. This approach is known as double-double arithmetic, and it can be similarly generalized to define quad-double and higher precision arithmetic. 
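+
+The building block behind double-double arithmetic can be sketched with the classic error-free "two-sum" transformation, which recovers exactly the part of an addition that was rounded away (a minimal illustration, not a full double-double implementation):
+
+```c++
+#include <cstdio>
+
+// Knuth's two-sum: s = fl(a + b), and err is chosen so that s + err == a + b exactly
+void two_sum(double a, double b, double &s, double &err) {
+    s = a + b;
+    double bv = s - a;         // the part of b that actually made it into s
+    double av = s - bv;        // the part of a that actually made it into s
+    err = (a - av) + (b - bv); // what was rounded away
+}
+
+int main() {
+    double s, err;
+    two_sum(1e16, 3.14, s, err);
+    printf("%.2f %.2f\n", s, err); // prints 10000000000000004.00 -0.86
+}
+```
+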
- diff --git a/content/english/hpc/arithmetic/float.md b/content/english/hpc/arithmetic/float.md index 2fe80f4c..dcc33039 100644 --- a/content/english/hpc/arithmetic/float.md +++ b/content/english/hpc/arithmetic/float.md @@ -9,7 +9,7 @@ The users of floating-point arithmetic deserve one of these IQ bell curve memes - Then they discover that `0.1 + 0.2 != 0.3` or some other quirk like that, freak out, start thinking that some random error term is added to every computation, and for many years avoid any real data types completely. - Then they finally man up, read the specification of how IEEE-754 floats work and start using them appropriately. -Most people are unfortunately still at stage 2, breeding various misconceptions about floating-point arithmetic — thinking that it is fundamentally imprecise and unstable, and slower than integer arithmetic. +Unfortunately, too many people are still at stage 2, breeding various misconceptions about floating-point arithmetic — thinking that it is fundamentally imprecise and unstable, and slower than integer arithmetic. ![](../img/iq.svg) @@ -117,7 +117,7 @@ struct fp { }; ``` -This way we can represent numbers in the form $\pm \\; m \times 2^e$ where both $m$ and $e$ are bounded *and possibly negative* integers — which would correspond to negative or small numbers respectively. The distribution of these numbers is very much non-uniform: there are as many numbers in the $[0, 1]$ range as in the $[0, +\infty)$ range. +This way we can represent numbers in the form $\pm \\; m \times 2^e$ where both $m$ and $e$ are bounded *and possibly negative* integers — which would correspond to negative or small numbers respectively. The distribution of these numbers is very much non-uniform: there are roughly as many numbers in the $[0, 1)$ range as in the $[1, +\infty)$ range. Note that these representations are not unique for some numbers. For example, number $1$ can be represented as @@ -127,7 +127,7 @@ $$ and in 28 other ways that don't overflow the mantissa. -This can be problematic for some applications, such as comparisons or hashing. To fix this, we can *normalize* these representations using a certain convention. In decimal, the [standard form](https://en.wikipedia.org/wiki/Scientific_notation) is to always put the comma after the first digit (`6.022e23`), and for binary we can do the same: +This can be problematic for some applications, such as comparisons or hashing. To fix this, we can *normalize* these representations using a certain convention. In decimal, the [standard form](https://en.wikipedia.org/wiki/Scientific_notation) is to always put the comma after the first digit (`6.022e23`), and for binary, we can do the same: $$ 42 = 10101_2 = 1.0101_2 \times 2^5 @@ -139,7 +139,7 @@ $$ \{ \pm \; (1 + m) \cdot 2^e \; | \; m = \frac{x}{2^{32}}, \; x \in [0, 2^{32}) \} $$ -Since $m$ is now a nonnegative value, we will now make it unsigned integer, and instead add a separate boolean field for the sign of the number: +Since $m$ is now a nonnegative value, we will now make it unsigned integer, and instead add a separate Boolean field for the sign of the number: ```cpp struct fp { @@ -187,108 +187,6 @@ fp operator*(fp a, fp b) { } ``` -Many applications that require higher levels of precision use software floating-point arithmetic in a similar fashion. 
Buf of course, you don't want to execute a sequence 10 or so instructions that this code compiles to each time you want to multiply two real numbers, so floating-point arithmetic is implemented in hardware — often in separate coprocessors due to its complexity. +Many applications that require higher levels of precision use software floating-point arithmetic in a similar fashion. But of course, you don't want to execute a sequence of 10 or so instructions that this code compiles to each time you want to multiply two real numbers, so on modern CPUs, floating-point arithmetic is implemented in hardware — usually as separate coprocessors due to its complexity. -The FPU of x86 (often referred to as x87) has separate registers and its own tiny instruction set that supports memory operations, basic arithmetic, trigonometry and some common operations such logarithm, exponent and square root. - -## IEEE 754 Floats - -When we designed our DIY floating-point type, we omitted quite a lot of important little details: - -- How many bits do we dedicate for the mantissa and the exponent? -- Does a "0" sign bit means "+" or is it the other way around? -- How are these bits stored in memory? -- How do we represent 0? -- How exactly does rounding happen? -- What happens if we divide by zero? -- What happens if we take a square root of a negative number? -- What happens if increment the largest representable number? -- Can we somehow detect if one of the above three happened? - -Most of the early computers didn't have floating-point arithmetic, and when vendors started adding floating-point coprocessors, they had slightly different vision for what answers to those questions should be. Diverse implementations made it difficult to use floating-point arithmetic reliably and portably — particularly for people developing compilers. - -In 1985, the Institute of Electrical and Electronics Engineers published a standard (called [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754)) that provided a formal specification of how floating-point numbers should work, which was quickly adopted by the vendors and is now used in virtually all general-purpose computers. - -### Float Formats - -Similar to our handmade float implementation, hardware floats use one bit for sign and a variable number of bits for exponent and mantissa. For example, the standard 32-bit `float` encoding uses the first (highest) bit for sign, the next 8 bits for exponent, and the 23 remaining bits for mantissa. - -![](../img/float.svg) - -One of the reasons why they are stored in this exact order is so that it would be easier to compare and sort them: you can simply use largely the same comparator circuit as for [unsigned integers](../integer) — except for maybe flipping the bits in the case of negative numbers. - -IEEE 754 and a few consequent standards define not one, but *several* representations that differ in sizes, most notably: - -| Type | Sign | Exponent | Mantissa | Total bits | Approx. decimal digits | -|----------:|------|----------|----------|------------|------------------------| -| single | 1 | 8 | 23 | 32 | ~7.2 | -| double | 1 | 11 | 52 | 64 | ~15.9 | -| half | 1 | 5 | 10 | 16 | ~3.3 | -| extended | 1 | 15 | 64 | 80 | ~19.2 | -| quadruple | 1 | 15 | 112 | 128 | ~34.0 | -| bfloat16 | 1 | 8 | 7 | 16 | ~2.3 | - -Their availability ranges from chip to chip: - -- Most CPUs support single- and double-precision — which is what `float` and `double` types refer to in C. 
-- Extended formats are exclusive to x86, and are available in C as the `long double` type, which falls back to double precision on arm. The choice of 64 bits for mantissa is so that every `long long` integer can be represented exactly. There is also a 40-bit format that similarly allocates 32 mantissa bits. -- Quadruple as well as the 256-bit "octuple" formats are only used for specific scientific computations and are not supported by general-purpose hardware. -- Half-precision arithmetic only supports a small subset of operations, and is generally used for machine learning applications, especially neural networks, because they tend to do a large amount of calculation, but don't require a high level of precision. -- Half-precision is being gradually replaced by bfloat, which trades off 3 mantissa bits to have the same range as single-precision, enabling interoperability with it. It is mostly being adopted by specialized hardware: TPUs, FGPAs and GPUs. The name stands for "[Brain](https://en.wikipedia.org/wiki/Google_Brain) float". - -Lower precision types need less memory bandwidth to move them around and usually take less cycles to operate on (e. g. the division instruction may take $x$, $y$, or $z$ cycles depending on the type), which is why they are preferred when error tolerance allows it. - -Deep learning, emerging as a very popular and computationally-intensive field, created a huge demand for low-precision matrix multiplication, which led to manufacturers developing separate hardware or at least adding specialized instructions that support these types of computations — most notably, Google developing a custom chip called TPU (*tensor processing unit*) that specializes on multiplying 128-by-128 bfloat matrices, and NVIDIA adding "tensor cores", capable of performing 4-by-4 matrix multiplication in one go, to all their newer GPUs. - -Apart from their sizes, most of behavior is exactly the same between all floating-point types, which we will now clarify. - -## Handling Corner Cases - -The default way integer arithmetic deals with corner cases such as division by zero is to crash. - -Sometimes a software crash in turn causes a real, physical one. In 1996, the maiden flight of the [Ariane 5](https://en.wikipedia.org/wiki/Ariane_5) (the space launch vehicle that ESA uses to lift stuff into low Earth orbit) ended in [a catastrophic explosion](https://www.youtube.com/watch?v=gp_D8r-2hwk) due to the policy of aborting computation on arithmetic error, which in this case was a floating-point to integer conversion overflow, that led to the navigation system thinking that it was off course and making a large correction, eventually causing the disintegration of a $1B rocket. - -There is a way to gracefully handle corner cases such like these: hardware interrupts. When an exception occurs, CPU: - -- interrupts the execution of a program; -- packs every all relevant information into a data structure called "interrupt vector"; -- passes it to the operating system, which in turn either calls the handling code if it exists (the "try-except" block) or terminates the program otherwise. - -This is a complex mechanism that deserves an article of its own, but since this is a book about performance, the only thing you need to know is that they are quite slow and not desirable in a real-time systems such as navigating rockets. - -### NaNs and Infinities - -Floating-point arithmetic often deals with noisy, real-world data, and exceptions there are much more common than in the integer case. 
For this reason, the default behavior is different. Instead of crashing, the result is substituted with a special value without interrupting the executing, unless the programmer explicitly wants to. - -The first type of such values are the two infinities: a positive and a negative one. They are generated if the result of an operation can't fit within in the representable range, and they are treated as such in arithmetic. - -$$ -\begin{aligned} - -∞ < x &< ∞ -\\ ∞ + x &= ∞ -\\ x ÷ ∞ &= 0 -\end{aligned} -$$ - -What happens if we, say, divide a value by zero? Should it be a negative or a positive infinity? This case in actually unambiguous because, somewhat less intuitively, there are also two zeros: a positive and a negative one. - -$$ - \frac{1}{+0} = +∞ -\;\;\;\; \frac{1}{-0} = -∞ -$$ - -Zeros are encoded by setting all bits to zero, except for the sign bit in the negative case. Infinities are encoded by setting all their exponent bits to one and all mantissa bits to zero, but the sign bit distinguishing between a positive and a negative infinity. - -The other type is the "not-a-number” (NaN), which is generated as the result of mathematically incorrect operations: - -$$ -\log(-1),\; \arccos(1.01),\; ∞ − ∞,\; −∞ + ∞,\; 0 × ∞,\; 0 ÷ 0,\; ∞ ÷ ∞ -$$ - -There are two types of NaNs: a signalling NaN and a quiet NaN. A signalling NaN raises an exception flag, which may or may not cause an immediate hardware interrupt because on FPU configuration, while a quiet NaN just propagates through almost every arithmetic operation, resulting in more NaNs. - -Both NaNs are encoded as all their exponent set to ones and the mantissa part being everything other than all zeroes (to distinguish them from infinities). - -## Further Reading - -If you are so inclined, you can read the classic "[What Every Computer Scientist Should Know About Floating-Point Arithmetic](https://www.itu.dk/~sestoft/bachelor/IEEE754_article.pdf)" (1991) and [the paper introducing Grisu3](https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf), the current state-of-the art for printing floating-point numbers. +The *floating-point unit* of x86 (often referred to as x87) has separate registers and its own tiny instruction set that supports memory operations, basic arithmetic, trigonometry, and some common operations such as logarithm, exponent, and square root. To make these operations properly work together, some additional details of floating-point number representation need to be clarified — which we will do in [the next section](../ieee-754). diff --git a/content/english/hpc/arithmetic/ieee-754.md b/content/english/hpc/arithmetic/ieee-754.md new file mode 100644 index 00000000..7787b589 --- /dev/null +++ b/content/english/hpc/arithmetic/ieee-754.md @@ -0,0 +1,115 @@ +--- +title: IEEE 754 Floats +weight: 2 +--- + +When we designed our [DIY floating-point type](../float), we omitted quite a lot of important little details: + +- How many bits do we dedicate for the mantissa and the exponent? +- Does a `0` sign bit mean `+`, or is it the other way around? +- How are these bits stored in memory? +- How do we represent 0? +- How exactly does rounding happen? +- What happens if we divide by zero? +- What happens if we take the square root of a negative number? +- What happens if we increment the largest representable number? +- Can we somehow detect if one of the above three happened? 
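+
+To preview the answers to the last few of these questions, here is a small sketch of what standardized floating-point hardware actually does (assuming the default IEEE-754 behavior that GCC and Clang give you without `-ffast-math`):
+
+```c++
+#include <cmath>
+#include <cstdio>
+#include <limits>
+
+int main() {
+    double zero = 0;
+    printf("%f\n", 1 / zero);                               // inf: dividing by zero produces an infinity
+    printf("%f\n", std::sqrt(-1.0));                        // nan (possibly -nan): an invalid operation produces a NaN
+    printf("%f\n", 2 * std::numeric_limits<double>::max()); // inf: overflowing the range also produces an infinity
+    printf("%d\n", std::isnan(std::sqrt(-1.0)));            // 1: these special values can be detected afterwards
+}
+```
+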
+
+Most of the early computers didn't support floating-point arithmetic, and when vendors started adding floating-point coprocessors, they had slightly different visions for what the answers to these questions should be. Diverse implementations made it difficult to use floating-point arithmetic reliably and portably — especially for the people who develop compilers.
+
+In 1985, the Institute of Electrical and Electronics Engineers published a standard (called [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754)) that provided a formal specification of how floating-point numbers should work, which was quickly adopted by the vendors and is now used in virtually all general-purpose computers.
+
+## Float Formats
+
+Similar to our handmade float implementation, hardware floats use one bit for sign and a variable number of bits for the exponent and the mantissa parts. For example, the standard 32-bit `float` encoding uses the first (highest) bit for sign, the next 8 bits for the exponent, and the 23 remaining bits for the mantissa.
+
+![](../img/float.svg)
+
+One of the reasons why they are stored in this exact order is that it is easier to compare and sort them: you can use mostly the same comparator circuit as for [unsigned integers](../integer), except for maybe flipping some bits in case one of the numbers is negative.
+
+For the same reason, the exponent is *biased:* the actual value is 127 less than the stored unsigned integer, which lets us also cover the values less than one (with negative exponents). In the example above:
+
+$$
+(-1)^0 \times 2^{01111100_2 - 127} \times (1 + 2^{-2})
+= 2^{124 - 127} \times 1.25
+= \frac{1.25}{8}
+= 0.15625
+$$
+
+IEEE 754 and a few subsequent standards define not one but *several* representations that differ in sizes, most notably:
+
+| Type | Sign | Exponent | Mantissa | Total bits | Approx. decimal digits |
+|----------:|------|----------|----------|------------|------------------------|
+| single | 1 | 8 | 23 | 32 | ~7.2 |
+| double | 1 | 11 | 52 | 64 | ~15.9 |
+| half | 1 | 5 | 10 | 16 | ~3.3 |
+| extended | 1 | 15 | 64 | 80 | ~19.2 |
+| quadruple | 1 | 15 | 112 | 128 | ~34.0 |
+| bfloat16 | 1 | 8 | 7 | 16 | ~2.3 |
+
+Their availability ranges from chip to chip:
+
+- Most CPUs support single- and double-precision — which is what `float` and `double` types refer to in C.
+- Extended formats are exclusive to x86, and are available in C as the `long double` type, which falls back to double precision on Arm CPUs. The choice of 64 bits for mantissa is so that every `long long` integer can be represented exactly. There is also a 40-bit format that similarly allocates 32 mantissa bits.
+- Quadruple as well as the 256-bit "octuple" formats are only used for specific scientific computations and are not supported by general-purpose hardware.
+- Half-precision arithmetic only supports a small subset of operations and is generally used for applications such as machine learning, especially neural networks, because they tend to perform large amounts of calculations but don't require high levels of precision.
+- Half-precision is being gradually replaced by bfloat, which trades off 3 mantissa bits to have the same range as single-precision, enabling interoperability with it. It is mostly being adopted by specialized hardware: TPUs, FPGAs, and GPUs. The name stands for "[Brain](https://en.wikipedia.org/wiki/Google_Brain) float."
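+
+To see this layout in action, here is a short sketch that pulls the sign, exponent, and mantissa fields out of the `0.15625` example above (the shifts and masks assume the standard 1-8-23 single-precision split):
+
+```c++
+#include <cstdio>
+#include <cstdint>
+#include <cstring>
+
+int main() {
+    float x = 0.15625; // from the example above: (-1)^0 * 2^(124 - 127) * 1.25
+    uint32_t bits;
+    memcpy(&bits, &x, sizeof(x)); // reinterpret the float as its raw 32-bit encoding
+    printf("sign=%u exponent=%u mantissa=0x%06x\n",
+           bits >> 31, (bits >> 23) & 0xff, bits & 0x7fffff);
+    // prints: sign=0 exponent=124 mantissa=0x200000 (the set bit is the 2^-2 = 0.25 place)
+}
+```
+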
+ +Lower-precision types need less memory bandwidth to move them around and usually take fewer cycles to operate on (e.g., the division instruction may take $x$, $y$, or $z$ cycles depending on the type), which is why they are preferred when error tolerance allows it. + +Deep learning, emerging as a very popular and computationally-intensive field, created a huge demand for low-precision matrix multiplication, which led to manufacturers developing separate hardware or at least adding specialized instructions that support these types of computations — most notably, Google developing a custom chip called TPU (*tensor processing unit*) that specializes on multiplying 128-by-128 bfloat matrices, and NVIDIA adding "tensor cores," capable of performing 4-by-4 matrix multiplication in one go, to all their newer GPUs. + +Apart from their sizes, most of the behavior is the same between all floating-point types, which we will now clarify. + +## Handling Corner Cases + +The default way integer arithmetic deals with corner cases such as division by zero is to crash. + +Sometimes a software crash, in turn, causes a real, physical one. In 1996, the maiden flight of the [Ariane 5](https://en.wikipedia.org/wiki/Ariane_5) (the space launch vehicle that ESA uses to lift stuff into low Earth orbit) ended in [a catastrophic explosion](https://www.youtube.com/watch?v=gp_D8r-2hwk) due to the policy of aborting computation on arithmetic error, which in this case was a floating-point to integer conversion overflow, that led to the navigation system thinking that it was off course and making a large correction, eventually causing the disintegration of a $200M rocket. + +There is a way to gracefully handle corner cases like these: hardware interrupts. When an exception occurs, the CPU + +- interrupts the execution of a program; +- packs all relevant information into a data structure called "interrupt vector"; +- passes it to the operating system, which in turn either calls the handling code if it exists (the "try-except" block) or terminates the program otherwise. + +This is a complex mechanism that deserves an article of its own, but since this is a book about performance, the only thing you need to know is that they are quite slow and not desirable in real-time systems such as navigating rockets. + +### NaNs, Zeros and Infinities + +Floating-point arithmetic often deals with noisy, real-world data. Exceptions there are much more common than in the integer case, and for this reason, the default behavior when handling them is different. Instead of crashing, the result is substituted with a special value without interrupting the program execution (unless the programmer explicitly wants it to). + +The first type of such value is the two infinities: a positive and a negative one. They are generated if the result of an operation can't fit within the representable range, and they are treated as such in arithmetic. + +$$ +\begin{aligned} + -∞ < x &< ∞ +\\ ∞ + x &= ∞ +\\ x ÷ ∞ &= 0 +\end{aligned} +$$ + +What happens if we, say, divide a value by zero? Should it be a negative or a positive infinity? This case is actually unambiguous because, somewhat less intuitively, there are also two zeros: a positive and a negative one. + +$$ + \frac{1}{+0} = +∞ +\;\;\;\; \frac{1}{-0} = -∞ +$$ + +Fun fact: `x + 0.0` can't be folded to `x`, but `x + (-0.0)` can, so the negative zero is a better initializer value than the positive zero as it is more likely to be optimized away by the compiler. 
The reason why `+0.0` doesn't work is that IEEE says that `+0.0 + -0.0 == +0.0`, so it will give a wrong answer for `x = -0.0`. The presence of two zeros frequently causes headaches like this — good news that you can pass `-fno-signed-zeros` to the compiler if you want to disable this behavior. + +Zeros are encoded by setting all bits to zero, except for the sign bit in the negative case. Infinities are encoded by setting all their exponent bits to one and all mantissa bits to zero, with the sign bit distinguishing between positive and negative infinity. + +The other type is the "not-a-number” (NaN), which is generated as the result of mathematically incorrect operations: + +$$ +\log(-1),\; \arccos(1.01),\; ∞ − ∞,\; −∞ + ∞,\; 0 × ∞,\; 0 ÷ 0,\; ∞ ÷ ∞ +$$ + +There are two types of NaNs: a *signaling NaN* and a *quiet NaN*. A signaling NaN raises an exception flag, which may or may not cause immediate hardware interrupt depending on the FPU configuration, while a quiet NaN just propagates through almost every arithmetic operation, resulting in more NaNs. + +In binary, both NaNs have their exponent bits all set and the mantissa part being anything other than all zeros (to distinguish them from infinities). Note that there are *very* many valid encodings for a NaN. + +## Further Reading + +If you are so inclined, you can read the classic "[What Every Computer Scientist Should Know About Floating-Point Arithmetic](https://www.itu.dk/~sestoft/bachelor/IEEE754_article.pdf)" (1991) and [the paper introducing Grisu3](https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf), the current state-of-the-art for printing floating-point numbers. diff --git a/content/english/hpc/arithmetic/integer.md b/content/english/hpc/arithmetic/integer.md index 8fd1059a..686db686 100644 --- a/content/english/hpc/arithmetic/integer.md +++ b/content/english/hpc/arithmetic/integer.md @@ -3,9 +3,9 @@ title: Integer Numbers weight: 5 --- -If you are reading this chapter sequentially from the beginning, you might be wondering: why would I introduce integer arithmetic after floating-point one? Isn't it supposed to be simpler? +If you are reading this chapter sequentially from the beginning, you might be wondering: why would I introduce integer arithmetic after floating-point one? Isn't it supposed to be easier? -This is true: plain integer representations are simpler. But, counterintuitively, their simplicity allows for more possibilities for operations to be expressed in terms of others. And if floating-point representations are so unwieldy that most of their operations are implemented in hardware, efficiently manipulating integers requires much more creative use of the instruction set. +True: plain integer representations are simpler. But, counterintuitively, their simplicity allows for more possibilities for operations to be expressed in terms of others. And if floating-point representations are so unwieldy that most of their operations are implemented in hardware, efficiently manipulating integers requires much more creative use of the instruction set. ## Binary Formats @@ -19,7 +19,7 @@ $$ \end{aligned} $$ -When the result of an operation can't fit into the word size (e. g. is more or equal to $2^{32}$ for 32-bit unsigned integers), it *overflows* by leaving only the lowest 32 bits of the result. Similarly, if the result is a negative value, it *underflows* by adding it to $2^{32}$, so that it always stays in the $[0, 2^{32})$ range. 
+When the result of an operation can't fit into the word size (e.g., is more or equal to $2^{32}$ for 32-bit unsigned integers), it *overflows* by leaving only the lowest 32 bits of the result. Similarly, if the result is a negative value, it *underflows* by adding it to $2^{32}$, so that it always stays in the $[0, 2^{32})$ range. This is equivalent to performing all operations modulo a power of two: @@ -31,18 +31,18 @@ $$ \end{aligned} $$ -In either case, it raises a special flag which you can check, but typically when people explicitly use unsigned integers they are expecting this behavior. +In either case, it raises a special flag which you can check, but usually when people explicitly use unsigned integers, they are expecting this behavior. ### Signed Integers -*Signed integers* support storing negative values by dedicating the highest bit to represent the sign of the number, in a similar fashion as floating-point number do. This halves the range of representable non-negative numbers: the maximum possible 32-bit integer is now $(2^{31}-1)$ and not $(2^{32}-1)$. But the encoding of negative values is not quite the same as for floating-point numbers. +*Signed integers* support storing negative values by dedicating the highest bit to represent the sign of the number, in a similar fashion as floating-point numbers do. This halves the range of representable non-negative numbers: the maximum possible 32-bit integer is now $(2^{31}-1)$ and not $(2^{32}-1)$. But the encoding of negative values is not quite the same as for floating-point numbers. -Computer engineers are even lazier than programmers — and this is not only motivated by the instinctive desire of simplification, but also by saving transistor space. This can achieved by reusing circuitry that you already have for other operations, which is what they aimed for when designing the signed integer format: +Computer engineers are even lazier than programmers — and this is not only motivated by the instinctive desire for simplification, but also by saving transistor space. This can be achieved by reusing circuitry that you already have for other operations, which is what they aimed for when designing the signed integer format: -- For a $n$-bit signed integer type, the encodings of all numbers in the $[0, 2^{n-1})$ range remains the same as their unsigned binary representation. +- For an $n$-bit signed integer type, the encodings of all numbers in the $[0, 2^{n-1})$ range remain the same as their unsigned binary representations. - All numbers in the $[-2^{n-1}, 0)$ range are encoded sequentially right after the "positive" range — that is, starting with $(-2^{n - 1})$ that has code $(2^{n-1})$ and ending with $(-1)$ that has code $(2^n - 1)$. -Essentially, all negative numbers are just encoded as if they were subtracted from $2^n$ — an operation known as *two's complement*: +One way to look at this is that all negative numbers are just encoded as if they were subtracted from $2^n$ — an operation known as *two's complement*: $$ \begin{aligned} @@ -51,12 +51,12 @@ $$ \end{aligned} $$ -Here $\bar{x}$ represents bitwise negation, which can be also though of as subtracting $x$ from $(2^n - 1)$. +Here $\bar{x}$ represents bitwise negation, which can be also thought of as subtracting $x$ from $(2^n - 1)$. As an exercise, here are some facts about signed integers: - All positive numbers and zero remain the same as their binary notation. -- All negative numbers have the highest bit set to zero. 
+- All negative numbers have the highest bit set to one. - There are more negative numbers than positive numbers (exactly by one — because of zero). - For `int`, if you add $1$ to $(2^{31}-1)$, the result will be $-2^{31}$, represented as `10000000` (for exposition purposes, we will only write 8 bits instead of 32). - Knowing a binary notation of a positive number `x`, you can get the binary notation of `-x` as `~x + 1`. @@ -66,36 +66,50 @@ As an exercise, here are some facts about signed integers: The main advantage of this encoding is that you don't have to do anything to convert unsigned integers to signed ones (except maybe check for overflow), and you can reuse the same circuitry for most operations, possibly only flipping the sign bit for comparisons and such. -That said, you need to be carefull with signed integer overflows. Even though they almost always overflow the same way as unsigned integers, programming languages usually consider the possibility of overflow as undefined behavior. If you need to overflow integer variables, convert them to unsigned integers: it's free anyway. +That said, you need to be careful with signed integer overflows. Even though they almost always overflow the same way as unsigned integers, programming languages usually consider the possibility of overflow as undefined behavior. If you need to overflow integer variables, convert them to unsigned integers: it's free anyway. + +**Exercise.** What is the only integer value for which `std::abs` produces a wrong result? What will this result be? ### Integer Types -Integers come in different sizes that all function roughly the same. +Integers come in different sizes, but all function roughly the same. + +| Bits | Bytes | Signed C type | Unsigned C type | Assembly | +|-----:|-------|----------------------|----------------------|----------| +| 8 | 1 | `signed char`[^char] | `unsigned char` | `byte` | +| 16 | 2 | `short` | `unsigned short` | `word` | +| 32 | 4 | `int` | `unsigned int` | `dword` | +| 64 | 8 | `long long` | `unsigned long long` | `qword` | -| Bits | Bytes | Signed C type | Unsigned C type | Assembly | -|-----:|-------|---------------|----------------------|----------| -| 8 | 1 | `signed char` | `char` | `byte` | -| 16 | 2 | `short` | `unsigned short` | `word` | -| 32 | 4 | `int` | `unsigned int` | `dword` | -| 64 | 8 | `long long` | `unsigned long long` | `qword` | +[^char]: Note that `char`, `unsigned char`, and `signed char` are technically three distinct types. The C standard leaves it up to the implementation whether the plain `char` is signed or unsigned (on most compilers, it is signed). -The bits of an integer are simply stored sequentially, and the only ambiguity here is the order in which to store them — left to right or right to left — called *endianness*. Depending on the architecture, the format can be either: +The bits of an integer are simply stored sequentially. The only ambiguity here is the order in which to store them — left to right or right to left — called *endianness*. Depending on the architecture, the format can be either: - *Little-endian*, which lists *lower* bits first. For example, $42_{10}$ will be stored as $010101$. - *Big-endian*, which lists *higher* bits first. All previous examples in this article follow it. -This seems like an important architecture aspect, but actually in most cases it doesn't make a difference: just pick one style and stick with it. 
But in some cases it does:
+This seems like an important architecture aspect, but in most cases, it doesn't make a difference: just pick one style and stick with it. But in some cases it does:
 
-- Little-endian has the advantage that you can cast a value to a smaller type (e. g. `long long` to `int`) by just loading fewer bytes, which in most cases means doing nothing — thanks to *register aliasing*, `eax` refers to the first 4 bytes of `rax`, so conversion is essentially free. It is also easier to read values in a variety of type sizes — while on big-endian architectures, loading a `int` from a `long long` array would require shifting the pointer by 2 bytes.
+- Little-endian has the advantage that you can cast a value to a smaller type (e.g., `long long` to `int`) by just loading fewer bytes, which in most cases means doing nothing — thanks to *register aliasing*, `eax` refers to the first 4 bytes of `rax`, so conversion is essentially free. It is also easier to read values in a variety of type sizes — while on big-endian architectures, loading an `int` from a `long long` array would require shifting the pointer by 4 bytes.
 - Big-endian has the advantage that higher bytes are loaded first, which in theory can make highest-to-lowest routines such as comparisons and printing faster. You can also perform certain checks such as finding out whether a number is negative by only loading its first byte.
 
-Big-endian is also more "natural" — this is how we write binary numbers on paper — but the advantage of having faster type conversions outweigh it. For this reason, little-endian is used by default on most hardware, although some CPUs are "bi-endian" and can be configured to switch modes on demand.
+Big-endian is also more "natural" — this is how we write binary numbers on paper — but the advantage of having faster type conversions outweighs it. For this reason, little-endian is used by default on most hardware, although some CPUs are "bi-endian" and can be configured to switch modes on demand.
 
 ### 128-bit Integers
 
-Sometimes we need to multiply two 64-bit integers to get a 128-bit integer — that usually serves as a temporary value and e. g. reduced by modulo right away.
+Sometimes we need to multiply two 64-bit integers to get a 128-bit integer — usually to serve as a temporary value and be reduced modulo a 64-bit integer right away.
 
-There are no 128-bit registers to hold the result of such multiplication, but `mul` instruction can operate in a manner [similar to division](/hpc/analyzing-performance/gcd/), by multiplying whatever is stored in `rax` by its operand and [writing the result](https://gcc.godbolt.org/z/4Gfxhs84Y) into two registers — the lower 64 bits of the result will go into `rdx`, and `rax` will have the higher 64 bits. Some languages have a special type to support such an operation:
+There are no 128-bit registers to hold the result of such multiplication, so the `mul` instruction, in addition to the normal `mul r r` form where it multiplies the values in registers and keeps the lower half of the result, has another `mul r` mode, where it multiplies whatever is stored in the `rax` register by its operand, and writes the result into two registers — the lower 64 bits of the result will go into `rax`, and the higher 64 bits go into `rdx`:
+
+```nasm
+; input: 64-bit integers a and b, stored in rdi and rsi
+; output: 128-bit product a * b, stored in rax (lower 64 bits) and rdx (higher 64 bits)
+mov rax, rdi
+imul rsi
+```
+
+Some compilers have a separate type supporting this operation. In GCC and Clang it is available as `__int128`:
 
 ```cpp
 void prod(int64_t a, int64_t b, __int128 *c) {
@@ -103,7 +117,7 @@ void prod(int64_t a, int64_t b, __int128 *c) {
 }
 ```
 
-For all purposes other than multiplication, 128-bit integers are just bundled as two registers. This makes it too weird to have a full-fledged 128-bit type, so the support for it is limited. The typical use for this type is to get either the lower or the higher part of the multiplication and forget about it:
+Its typical use case is to immediately extract either the lower or the higher part of the multiplication and forget about it:
 
 ```c++
 __int128_t x = 1;
@@ -111,4 +125,23 @@ int64_t hi = x >> 64;
 int64_t lo = (int64_t) x; // will be just truncated
 ```
 
-Other platforms provide similar mechanisms for dealing with longer-than-word multiplication. For example, arm has `mulhi` and `mullo` instruction, returning lower and higher parts of the multiplication, and x86 SIMD extensions have similar 32-bit instructions.
+For all purposes other than multiplication, 128-bit integers are just bundled as two registers. This makes it too weird to have a full-fledged 128-bit type, so the support for it is limited, other than for basic arithmetic operations. For example:
+
+```c++
+__int128_t add(__int128_t a, __int128_t b) {
+    return a + b;
+}
+```
+
+is compiled into:
+
+```nasm
+add:
+    mov rax, rdi
+    add rax, rdx ; this sets the carry flag in case of an overflow
+    adc rsi, rcx ; +1 if the carry flag is set
+    mov rdx, rsi
+    ret
+```
+
+Other platforms provide similar mechanisms for dealing with longer-than-word multiplication. For example, Arm has `mulhi` and `mullo` instructions, returning lower and higher parts of the multiplication, and x86 [SIMD extensions](/hpc/simd) have similar 32-bit instructions.
diff --git a/content/english/hpc/arithmetic/newton.md b/content/english/hpc/arithmetic/newton.md
index ee92f98a..510312aa 100644
--- a/content/english/hpc/arithmetic/newton.md
+++ b/content/english/hpc/arithmetic/newton.md
@@ -3,9 +3,9 @@ title: Newton's Method
 weight: 3
 ---
 
-Reaching the maximum possible precision is very rarely required from a practical algorithm. In real-world data, modeling and measurement errors are usually a few orders of magnitude larger than the errors that come from rounding floating-point numbers and such, so we are often perfectly happy with picking an approximate method that trades off precision for speed.
+Reaching the maximum possible precision is rarely required from a practical algorithm.
In real-world data, modeling and measurement errors are usually several orders of magnitude larger than the errors that come from rounding floating-point numbers and such, and we are often perfectly happy with picking an approximate method that trades off precision for speed. -In this section, we will go through some classic numerical methods, just to get the gist of it. +In this section, we introduce one of the most important building blocks in such approximate, numerical algorithms: *Newton's method*. ## Newton's Method @@ -15,9 +15,9 @@ $$ f(x) = 0 $$ -The only thing assumed about the function $f$ is that at least a one root exists and that $f(x)$ is continuous and differentiable on the search interval. +The only thing assumed about the function $f$ is that at least one root exists and that $f(x)$ is continuous and differentiable on the search interval. There are also some [boring corner cases](https://en.wikipedia.org/wiki/Newton%27s_method#Failure_analysis), but they almost never occur in practice, so we will just informally say that the function is "good." -The main idea of the algorithm is to start with some initial approximation $x_0$ and then iteratively improve it by drawing the tangent to the graph of the function at $x = x_i$ and setting the next approximation $x_{i+1}$ equal to the $x$-coordinate of its intersection with the $x$-axis. The intuition is that if the function $f$ is "[good](https://en.wikipedia.org/wiki/Smoothness)", and $x_i$ is already close enough to the root, then $x_{i+1}$ will be even closer. +The main idea of the algorithm is to start with some initial approximation $x_0$ and then iteratively improve it by drawing the tangent to the graph of the function at $x = x_i$ and setting the next approximation $x_{i+1}$ equal to the $x$-coordinate of its intersection with the $x$-axis. The intuition is that if the function $f$ is "[good](https://en.wikipedia.org/wiki/Smoothness)" and $x_i$ is already close enough to the root, then $x_{i+1}$ will be even closer. ![](../img/newton.png) @@ -33,7 +33,7 @@ $$ x_{i+1} = x_i - \frac{f(x_i)}{f'(x_i)} $$ -Newton's method is very important: it is the basis of most optimization solvers in science and engineering. +Newton's method is very important: it is the basis of a wide range of optimization solvers in science and engineering. ### Square Root @@ -62,15 +62,15 @@ double sqrt(double n) { } ``` -The algorithm converges for many functions, although it does so reliably and provably only for a certain subset of them (e. g. convex functions). Another question is how fast the convergence is, if it occurs. +The algorithm converges for many functions, although it does so reliably and provably only for a certain subset of them (e.g., convex functions). Another question is how fast the convergence is, if it occurs. ### Rate of Convergence Let's run a few iterations of Newton's method to find the square root of $2$, starting with $x_0 = 1$, and check how many digits it got correct after each iteration: -
-1
-1.5
+
+1.0000000000000000000000000000000000000000000000000000000000000
+1.5000000000000000000000000000000000000000000000000000000000000
 1.4166666666666666666666666666666666666666666666666666666666675
 1.4142156862745098039215686274509803921568627450980392156862745
 1.4142135623746899106262955788901349101165596221157440445849057
@@ -104,7 +104,7 @@ $$
 which means that the error roughly squares (and halves) on each iteration once we are close to the solution. Since the logarithm $(- \log_{10} \delta_i)$ is roughly the number of accurate significant digits in the answer $x_i$, squaring the relative error corresponds precisely to doubling the number of significant
 digits that we had observed.
 
-This is known as *quadratic convergence*, and in fact this is not limited to finding square roots. With detailed proof being left as an exercise to the reader, it can be shown that, in general
+This is known as *quadratic convergence*, and in fact, it is not limited to finding square roots. With the detailed proof left as an exercise to the reader, it can be shown that, in general,
 
 $$
 |\delta_{i+1}| = \frac{|f''(x_i)|}{2 \cdot |f'(x_i)|} \cdot \delta_i^2
@@ -114,4 +114,4 @@ which results in at least quadratic convergence under a few additional assumptio
 
 ## Further Reading
 
-[Introduction to numerical methods at MIT](https://ocw.mit.edu/courses/mathematics/18-330-introduction-to-numerical-analysis-spring-2012/lecture-notes/MIT18_330S12_Chapter4.pdf)
+[Introduction to numerical methods at MIT](https://ocw.mit.edu/courses/mathematics/18-330-introduction-to-numerical-analysis-spring-2012/lecture-notes/MIT18_330S12_Chapter4.pdf).
diff --git a/content/english/hpc/arithmetic/inverse-sqrt.md b/content/english/hpc/arithmetic/rsqrt.md
similarity index 53%
rename from content/english/hpc/arithmetic/inverse-sqrt.md
rename to content/english/hpc/arithmetic/rsqrt.md
index 8687341f..f1529d42 100644
--- a/content/english/hpc/arithmetic/inverse-sqrt.md
+++ b/content/english/hpc/arithmetic/rsqrt.md
@@ -3,19 +3,19 @@ title: Fast Inverse Square Root
 weight: 4
 ---
 
-The inverse square root of a floating-point number $\frac{1}{\sqrt x}$ is used in calculating normalized vectors, which are in turn extensively used in various simulation scenarios such as computer graphics, e. g. to determine angles of incidence and reflection to simulate lighting.
+The inverse square root of a floating-point number $\frac{1}{\sqrt x}$ is used in calculating normalized vectors, which are in turn extensively used in various simulation scenarios such as computer graphics (e.g., to determine angles of incidence and reflection to simulate lighting).
 
 $$
 \hat{v} = \frac{\vec v}{\sqrt {v_x^2 + v_y^2 + v_z^2}}
 $$
 
-Calculating inverse square root directly — by first calculating square root and then dividing by it — is extremely slow, because both of these operations are slow even though they are implemented in hardware.
+Calculating an inverse square root directly — by first calculating a square root and then dividing $1$ by it — is extremely slow because both of these operations are slow even though they are implemented in hardware.
 
 But there is a surprisingly good approximation algorithm that takes advantage of the way floating-point numbers are stored in memory. In fact, it is so good that it has been [implemented in hardware](https://www.felixcloutier.com/x86/rsqrtps), so the algorithm is no longer relevant by itself for software engineers, but we are nonetheless going to walk through it for its intrinsic beauty and great educational value.
 
-Apart from the method itself, quite interesting is the history of its creation. It is attributed to a game studio *id Software* that used it in their iconic 1999 game *Quake III Arena*, although apparently it got there by a chain of "I learned it from a guy who learned it from a guy" that seems to end on William Kahan (the same one that is responsible for IEEE 754 and Kahan summation algorithm).
+Apart from the method itself, the history of its creation is also quite interesting. It is attributed to the game studio *id Software*, which used it in their iconic 1999 game *Quake III Arena*, although apparently, it got there through a chain of "I learned it from a guy who learned it from a guy" that seems to end with William Kahan (the same one who is responsible for IEEE 754 and the Kahan summation algorithm).
 
-It became popular in game developing community around 2005, when they released the source code of the game. Here is [the relevant excerpt from it](https://github.com/id-Software/Quake-III-Arena/blob/master/code/game/q_math.c#L552), including the comments:
+It became popular in the game development community around 2005, when the source code of the game was released. Here is [the relevant excerpt from it](https://github.com/id-Software/Quake-III-Arena/blob/master/code/game/q_math.c#L552), including the comments:
 
 ```c++
 float Q_rsqrt(float number) {
@@ -35,9 +35,9 @@ float Q_rsqrt(float number) {
 }
 ```
 
-We will go through what it does step by step, but first we need to take a small detour.
+We will go through what it does step by step, but first, we need to take a small detour.
 
-### Calculating Approximate Logarithm
+### Approximate Logarithm
 
 Before computers (or at least affordable calculators) became an everyday thing, people computed multiplication and related operations using logarithm tables — by looking up the logarithms of $a$ and $b$, adding them, and then finding the inverse logarithm of the result.
 
@@ -51,9 +51,9 @@ $$
 \log \frac{1}{\sqrt x} = - \frac{1}{2} \log x
 $$
 
-The fast inverse square root is based on this identity, and so it needs to calculate the logarithm of $x$ very quickly. Turns out, it can be approximated by just reinterpreting a 32-bit `float` as integer.
+The fast inverse square root is based on this identity, and so it needs to calculate the logarithm of $x$ very quickly. Turns out, it can be approximated by just reinterpreting a 32-bit `float` as an integer.
 
-[Recall](../float), floating-point numbers sequentially store the sign bit (equal to zero for positive values, which is our case), exponent $e_x$ and mantissa $m_x$, which corresponds to
+[Recall](../float) that floating-point numbers sequentially store the sign bit (equal to zero for positive values, which is our case), exponent $e_x$ and mantissa $m_x$, which corresponds to
 
 $$
 x = 2^{e_x} \cdot (1 + m_x)
@@ -65,31 +65,33 @@ $$
 \log_2 x = e_x + \log_2 (1 + m_x)
 $$
 
-Since $m_x \in [0, 1)$, the logarithm on the right hand side can be approximated by
+Since $m_x \in [0, 1)$, the logarithm on the right-hand side can be approximated by
 
 $$
 \log_2 (1 + m_x) \approx m_x
 $$
 
-The approximation is exact at both ends of the intervals, but to account for average case we need to shift it by a small constant $\sigma$, therefore
+The approximation is exact at both ends of the interval, but to account for the average case we need to shift it by a small constant $\sigma$, therefore
 
 $$
 \log_2 x = e_x + \log_2 (1 + m_x) \approx e_x + m_x + \sigma
 $$
 
-Now, having this approximation in mind and defining $L=23$ as the number of mantissa bits in a `float` and $B=127$ for the exponent bias, when we reinterpret the bit-pattern of $x$ as an integer $I_x$, we get
+Now, having this approximation in mind and defining $L=2^{23}$ (two to the power of the number of mantissa bits in a `float`) and $B=127$ (the exponent bias), when we reinterpret the bit-pattern of $x$ as an integer $I_x$, we essentially get
 
 $$
 \begin{aligned}
-I_x &= L(e_x + B + m_x)
-\\  &= L(e_x + m_x + \sigma +B-\sigma )
-\\  &\approx L\log_2 (x) + L (B-\sigma )
+I_x &= L \cdot (e_x + B + m_x)
+\\  &= L \cdot (e_x + m_x + \sigma +B-\sigma )
+\\  &\approx L \cdot \log_2 (x) + L \cdot (B-\sigma )
 \end{aligned}
 $$
 
-When you tune $\sigma$ to minimize them mean square error, this results in a surprisingly accurate approximation.
+(Multiplying an integer by $L=2^{23}$ is equivalent to left-shifting it by 23.)
 
-![](../img/approx.svg)
+When you tune $\sigma$ to minimize the mean square error, this results in a surprisingly accurate approximation.
+
+![Reinterpreting a floating-point number $x$ as an integer (blue) compared to its scaled and shifted logarithm (gray)](../img/approx.svg)
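+
+To get a feel for how accurate this is, here is a small, self-contained sketch (the function name is ours, and we take $\sigma = 0$ for simplicity) that compares the bit-reinterpretation estimate with the exact logarithm:
+
+```c++
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+
+// I_x / L - B = e_x + m_x, which approximates log2(x) (the sigma correction is omitted)
+float approx_log2(float x) {
+    unsigned i;
+    memcpy(&i, &x, 4);               // reinterpret the bits of x as an integer
+    return i / float(1 << 23) - 127; // divide by L and subtract the bias B
+}
+
+int main() {
+    for (float x : {1.0f, 2.0f, 3.0f, 10.0f, 1000.0f})
+        printf("%7.1f: %.3f vs %.3f\n", x, approx_log2(x), std::log2(x));
+}
+```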
 
 Now, expressing the logarithm from the approximation, we get
 
@@ -99,7 +101,7 @@ $$
 
 Cool. Now, where were we? Oh, yes, we wanted to calculate the inverse square root.
 
-### Approximating Result
+### Approximating the Result
 
 To calculate $y = \frac{1}{\sqrt x}$ using the identity $\log_2 y = - \frac{1}{2} \log_2 x$, we can plug it into our approximation formula and get
 
@@ -115,32 +117,32 @@ $$
 I_y \approx \frac{3}{2} L (B - \sigma) - \frac{1}{2} I_x
 $$
 
-It turns out, we don't even need to calculate logarithm in the first place: the formula above is just a constant minus the half of integer reinterpretation of $x$. It is written in the code as:
+It turns out, we don't even need to calculate the logarithm in the first place: the formula above is just a constant minus half the integer reinterpretation of $x$. It is written in the code as:
 
 ```cpp
 i = * ( long * ) &y;
 i = 0x5f3759df - ( i >> 1 );
 ```
 
-We reinterpret `y` as an integer on the first line, and then plug into in to the formula, the first term of which is the magic number $\frac{3}{2} L (B - \sigma) = \mathtt{0x5F3759DF}$, while the second is calculated with a binary shift instead of division.
+We reinterpret `y` as an integer on the first line and then plug it into the formula on the second. The first term of the formula is the magic number $\frac{3}{2} L (B - \sigma) = \mathtt{0x5F3759DF}$, while the second is calculated with a binary shift instead of division.
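+
+As a sanity check, we can recompute the constant ourselves (a quick sketch; $\sigma \approx 0.0450466$ is an assumed value of the tuned shift):
+
+```c++
+#include <cmath>
+#include <cstdio>
+
+int main() {
+    const double sigma = 0.0450466;    // assumed: the tuned value of the shift
+    const double L = 1 << 23, B = 127; // mantissa scale and exponent bias
+    long long magic = std::llround(1.5 * L * (B - sigma));
+    printf("0x%llX\n", (unsigned long long) magic); // prints 0x5F3759DF
+}
+```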
 
 ### Iterating with Newton's Method
 
-What we have next is a couple hand-coded iterations of Newton's method with $f(y) = \frac{1}{y^2} - x$ and a very good initial value. It's update rule is
+What we have next is a couple of hand-coded iterations of Newton's method with $f(y) = \frac{1}{y^2} - x$ and a very good initial value. Its update rule is
 
 $$
 f'(y) = - \frac{2}{y^3} \implies y_{i+1} = y_{i} (\frac{3}{2} - \frac{x}{2} y_i^2) = \frac{y_i (3 - x y_i^2)}{2}
 $$
 
-which is written in code as
+which is written in the code as
 
 ```cpp
 x2 = number * 0.5F;
 y  = y * ( threehalfs - ( x2 * y * y ) );
 ```
 
-The initial approximation is so good that just one iteration was enough for game development purposes. It falls within 99.8% of the correct answer after just the first iteration, and can be reiterated further to improve accuracy — which is what is done in the hardware: the x86 does a few of them and guarantees a relative error of no more than $1.5 \times 2^{-12}$.
+The initial approximation is so good that just one iteration was enough for game development purposes. It falls within 99.8% of the correct answer after just the first iteration and can be reiterated further to improve accuracy — which is what is done in the hardware: [the x86 instruction](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=3037,3009,5135,4870,4870,4872,4875,833,879,874,849,848,6715,4845,6046,3853,288,6570,6527,6527,90,7307,6385,5993&text=rsqrt&techs=AVX,AVX2) does a few of them and guarantees a relative error of no more than $1.5 \times 2^{-12}$.
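+
+In modern code, you would simply call that instruction through its intrinsic. A minimal sketch (assuming an x86 CPU with SSE, which nowadays is virtually all of them):
+
+```c++
+#include <cstdio>
+#include <xmmintrin.h>
+
+int main() {
+    float x = 4.0f;
+    float y = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));
+    printf("%f\n", y); // ~0.5, with a relative error of at most 1.5 * 2^-12
+}
+```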
 
-## Further Reading
+### Further Reading
 
-[Wikipedia article of fast inverse square root](https://en.wikipedia.org/wiki/Fast_inverse_square_root#Floating-point_representation)
+[Wikipedia article on fast inverse square root](https://en.wikipedia.org/wiki/Fast_inverse_square_root#Floating-point_representation).
diff --git a/content/english/hpc/compilation/_index.md b/content/english/hpc/compilation/_index.md
index cbc0f691..e32ba624 100644
--- a/content/english/hpc/compilation/_index.md
+++ b/content/english/hpc/compilation/_index.md
@@ -6,6 +6,6 @@ weight: 4
 
 The main benefit of [learning assembly language](../architecture/assembly) is not the ability to write programs in it, but the understanding of what is happening during the execution of compiled code and its performance implications.
 
-There are rare cases where we *really* need to switch to handwritten assembly for maximal performance, but most of the time compilers are capable of producing near-optimal code all by themselves. When they do not, it is usually because the programmer knows more about the problem than what can be inferred from the source code, but failed to communicate this extra information to the compiler.
+There are rare cases where we *really* need to switch to handwritten assembly for maximal performance, but most of the time compilers are capable of producing near-optimal code all by themselves. When they do not, it is usually because the programmer knows more about the problem than what can be inferred from the source code but failed to communicate this extra information to the compiler.
 
-In this chapter, we will discuss the intricacies of getting compiler to do exactly what we want and gathering useful information that can guide further optimizations.
+In this chapter, we will discuss the intricacies of getting the compiler to do exactly what we want and gathering useful information that can guide further optimizations.
diff --git a/content/english/hpc/compilation/abstractions.md b/content/english/hpc/compilation/abstractions.md
index a11026b7..004b809b 100644
--- a/content/english/hpc/compilation/abstractions.md
+++ b/content/english/hpc/compilation/abstractions.md
@@ -27,3 +27,62 @@ Usually it isn't that hard to rewrite a small program so that it is more straigh
 Object-oriented and especially functional languages have some very hard-to-pierce abstractions like these. For this reason, people often prefer to write performance critical software (interpreters, runtimes, databases) in a style closer to C rather than higher-level languages.
 
 Thick-bearded C/assembly programmers.
+
+### Memory
+
+A common source of hidden slowdowns is *pointer chasing*. Consider a matrix stored as a vector of vectors:
+
+```c++
+typedef vector< vector<int> > matrix;
+matrix a(n, vector<int>(n, 0));
+
+int val = a[i][j];
+```
+
+This is up to twice as slow: you first need to fetch the pointer to the row, and only then the value itself, making two dependent memory reads instead of one. Using a single flat array avoids this:
+
+```c++
+int *a = new int[n * n];
+memset(a, 0, 4 * n * n);
+
+int val = a[i * n + j];
+```
+
+You can write a wrapper if you really want an abstraction:
+
+```c++
+template<typename T>
+struct Matrix {
+    int x, y, n, N;
+    T* data;
+    T* operator[](int i) { return data + (x + i) * N + y; }
+};
+```
+
+For example, the [cache-oblivious transposition](/hpc/external-memory/oblivious) would go like this:
+
+```c++
+Matrix subset(int _x, int _y, int _n) { return {_x, _y, _n, N, data}; }
+
+Matrix transpose() {
+    if (n <= 32) {
+        for (int i = 0; i < n; i++)
+            for (int j = 0; j < i; j++)
+                swap((*this)[j][i], (*this)[i][j]);
+    } else {
+        auto A = subset(x, y, n / 2).transpose();
+        auto B = subset(x + n / 2, y, n / 2).transpose();
+        auto C = subset(x, y + n / 2, n / 2).transpose();
+        auto D = subset(x + n / 2, y + n / 2, n / 2).transpose();
+        for (int i = 0; i < n / 2; i++)
+            for (int j = 0; j < n / 2; j++)
+                swap(B[i][j], C[i][j]);
+    }
+
+    return *this;
+}
+```
+
+I personally prefer to write low-level code, because it is easier to optimize.
+
+Is it cleaner? I don't think so.
diff --git a/content/english/hpc/compilation/contracts.md b/content/english/hpc/compilation/contracts.md
index 796a6702..56a50d6b 100644
--- a/content/english/hpc/compilation/contracts.md
+++ b/content/english/hpc/compilation/contracts.md
@@ -3,11 +3,9 @@ title: Contract Programming
 weight: 6
 ---
 
-In "safe" languages like Java and Rust, you normally have well-defined behavior for every possible operation and every possible input. There are some things that are *under-defined*, like the order of keys in a hash table, but these are usually some minor details left to implementation for potential performance gains in the future.
+In "safe" languages like Java and Rust, you normally have well-defined behavior for every possible operation and every possible input. There are some things that are *under-defined*, like the order of keys in a hash table or the growth factor of an `std::vector`, but these are usually some minor details that are left up to implementation for potential performance gains in the future.
 
-In contrast, C and C++ take the concept of undefined behavior to another level. Certain operations don't cause an error during compilation or runtime but are just not *allowed* — in the sense of there being a *contract* between the programmer and the compiler, that in case of undefined behavior the compiler can do literally anything, including formatting your hard drive.
-
-But compiler engineers are not interested in formatting your hard drive of blowing up your monitor. Instead, undefined behavior is used to guarantee a lack of corner cases and help optimization.
+In contrast, C and C++ take the concept of undefined behavior to another level. Certain operations don't cause an error during compilation or runtime but are just not *allowed* — in the sense of there being a *contract* between the programmer and the compiler, that in case of undefined behavior, the compiler is legally allowed to do literally anything, including blowing up your monitor or formatting your hard drive. But compiler engineers are not interested in doing that. Instead, undefined behavior is used to guarantee a lack of corner cases and help optimization.
 
 ### Why Undefined Behavior Exists
 
@@ -15,17 +13,15 @@ There are two major groups of actions that cause undefined behavior:
 
 - Operations that are almost certainly unintentional bugs, like dividing by zero, dereferencing a null pointer, or reading from uninitialized memory. You want to catch these as soon as possible during testing, so crashing or having some non-deterministic behavior is better than having them always do a fixed fallback action such as returning zero.
 
-  You can compile and run a program with *sanitizers* to catch undefined behavior early. In GCC and Clang, you can use the `-fsanitize=undefined` flag, and some operations that frequently cause UB will be instrumented to detect it at runtime.
-
-- Operations that have slightly different observable behavior on different platforms. For example, the result of left-shifting an integer by more than 31 bits is undefined, because the relevant instructions are implemented differently on Arm and x86 CPUs. If you standardize one specific behavior, then all programs compiled for the other platform will have to spend a few more cycles checking for that edge case, so it is best to leave it either undefined.
+  You can compile and run a program with *sanitizers* to catch undefined behavior early. In GCC and Clang, you can use the `-fsanitize=undefined` flag, and some operations that are notorious for causing UB will be instrumented to detect it at runtime (see the short example after this list).
 
-  Sometimes, when there is a legitimate use case for some platform-specific behavior, it can be left *implementation-defined* instead of being undefined. For example, the result of right-shifting a [negative integer](/hpc/arithmetic/integer) depends on the platform: it either shifts in zeros or ones (e. g. right shifting `11010110 = -42` by one may mean either `01101011 = 107` or `11101011 = -21`, both use cases being realistic).
+- Operations that have slightly different observable behavior on different platforms. For example, the result of left-shifting an integer by more than 31 bits is undefined, because the instruction that does it is implemented differently on Arm and x86 CPUs. If you standardize one specific behavior, then all programs compiled for the other platform will have to spend a few more cycles checking for that edge case, so it is best to leave it undefined.
 
-Designating something as undefined instead of implementation-defined behavior also helps compilers in optimization.
+  Sometimes, when there is a legitimate use case for some platform-specific behavior, instead of declaring it undefined, it can be left *implementation-defined*. For example, the result of right-shifting a [negative integer](/hpc/arithmetic/integer) depends on the platform: it either shifts in zeros or ones (e.g., right-shifting `11010110 = -42` by one may mean either `01101011 = 107` or `11101011 = -21`, both use cases being realistic).
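+
+As a quick illustration of the first group, signed integer overflow is undefined behavior, and the sanitizer reports it at run time (a minimal sketch; the build command is an assumption based on the flag above):
+
+```c++
+// compile with something like: g++ -fsanitize=undefined -g overflow.cc && ./a.out
+#include <climits>
+
+int main() {
+    int x = INT_MAX;
+    return x + 1; // UBSan: signed integer overflow
+}
+```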
 
-Consider the case of signed integer overflow. On almost all architectures, [signed integers](/hpc/arithmetic/integer) overflow the same way as unsigned ones, with `INT_MAX + 1 == INT_MIN`, but yet this is undefined behavior in the C/C++ standard. This is very much intentional: if you disallow signed integer overflow, then `(x + 1) > x` is guaranteed to be always true for `int`, but not for `unsigned int`, because `(x + 1)` may overflow. For signed types, this lets compilers optimize such checks away.
+Designating something as undefined instead of implementation-defined behavior also helps compilers in optimization. Consider the case of signed integer overflow. On almost all architectures, [signed integers](/hpc/arithmetic/integer) overflow the same way as unsigned ones, with `INT_MAX + 1 == INT_MIN`, and yet, this is undefined behavior according to the C++ standard. This is very much intentional: if you disallow signed integer overflow, then `(x + 1) > x` is guaranteed to be always true for `int`, but not for `unsigned int`, because `(x + 1)` may overflow. For signed types, this lets compilers optimize such checks away.
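+
+To make this concrete, here is a small sketch (the function names are ours) of the kind of check that the compiler can remove for signed types but has to keep for unsigned ones:
+
+```c++
+// for signed x, overflow is UB, so this can be folded into "return true"
+bool incremented_is_greater(int x) { return x + 1 > x; }
+
+// for unsigned x, (x + 1) may wrap around to 0, so the comparison has to stay
+bool incremented_is_greater(unsigned x) { return x + 1 > x; }
+```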
 
-As a more naturally occurring example, consider the case of a loop with an integer control variable. Modern C++ and languages like Rust are advocating for using an unsigned integer (`size_t` / `usize`), while C programmers stubbornly keep using `int`. To understand why, consider the following `for` loop:
+As a more naturally occurring example, consider the case of a loop with an integer control variable. Modern C++ and languages like Rust encourage programmers to use an unsigned integer (`size_t` / `usize`), while C programmers stubbornly keep using `int`. To understand why, consider the following `for` loop:
 
 ```cpp
 for (unsigned int i = 0; i < n; i++) {
@@ -33,7 +29,7 @@ for (unsigned int i = 0; i < n; i++) {
 }
 ```
 
-How many times does this loop execute? There are technically two valid answers: $n$ and infinity, the second being the case if $n$ exceeds $2^{32}$ so that $i$ keeps resetting to zero every $2^{32}$ iterations. While the former is probably the one assumed by the programmer, to comply with the language spec, the compiler still has to insert additional runtime checks and consider the two cases, which should be optimized differently. Meanwhile, the `int` version would make exactly $n$ iterations, because the very possibility of a signed overflow is defined out of existence.
+How many times does this loop execute? There are technically two valid answers: $n$ and infinity, the second being the case if $n$ exceeds $2^{32}$ so that $i$ keeps resetting to zero every $2^{32}$ iterations. While the former is probably the one assumed by the programmer, to comply with the language spec, the compiler still has to insert additional runtime checks and consider the two cases, which should be optimized differently. Meanwhile, the `int` version would make exactly $n$ iterations because the very possibility of a signed overflow is defined out of existence.
 
 ### Removing Corner Cases
 
@@ -49,13 +45,13 @@ T at(size_t k) {
 }
 ```
 
-Interestingly, these checks are rarely actually executed during runtime, because the compiler can often prove, during compilation time, that each access will be within bounds. For example, when iterating in a `for` loop from 1 to the array size and indexing $i$-th element on each step, nothing illegal can possibly happen, so the bounds checks can be safely optimized away.
+Interestingly, these checks are rarely actually executed during runtime because the compiler can often prove — during compile time — that each access will be within bounds. For example, when iterating in a `for` loop from 1 to the array size and indexing $i$-th element on each step, nothing illegal can possibly happen, so the bounds checks can be safely optimized away.
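+
+For example, in a loop like the following one (a sketch, assuming `v` is an `std::vector<int>` or a similar container with a checked `at`), every index is provably within bounds, so the checks can be optimized away:
+
+```c++
+int sum = 0;
+for (size_t i = 0; i < v.size(); i++)
+    sum += v.at(i); // the compiler can prove that i < v.size() and drop the check
+```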
 
 ### Assumptions
 
-When the compiler can't prove the inexistence of corner cases, but you can, this additional information can be provided using the mechanism of undefined behavior.
+When the compiler can't prove the inexistence of corner cases, but *you* can, this additional information can be provided using the mechanism of undefined behavior.
 
-Clang has a helpful `__builtin_assume` function where you can put a statement that is guaranteed to be true, and the compiler will use this assumption in optimization. In GCC you can do the same with `__builtin_unreachable`:
+Clang has a helpful `__builtin_assume` function where you can put a statement that is guaranteed to be true, and the compiler will use this assumption in optimization. In GCC, you can do the same with `__builtin_unreachable`:
 
 ```cpp
 void assume(bool pred) {
@@ -64,9 +60,9 @@ void assume(bool pred) {
 }
 ```
 
-For instance, you can put `assume(k < vector.size())` before `at` in the example above, and then the bounds check should be optimized away.
+For instance, you can put `assume(k < vector.size())` before `at` in the example above, and then the bounds check will be optimized away.
 
-It is also quite useful to combine `assume` with `assert` and `static_assert` to find bugs: you can use the same function to check preconditions in the debug build, and then use them to improve performance in the production build.
+It is also quite useful to combine `assume` with `assert` and `static_assert` to find bugs: you can use the same function to check preconditions in the debug build and then use them to improve performance in the production build.
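+
+One possible way to wire the two together (a sketch; the macro name is ours) is to branch on the standard `NDEBUG` macro, so that debug builds verify the precondition and release builds only promise it to the compiler:
+
+```c++
+// assumes <cassert> is included and the assume() helper from above is defined
+#ifdef NDEBUG
+#define check(cond) assume(cond) // release build: an optimization hint
+#else
+#define check(cond) assert(cond) // debug build: an actual runtime check
+#endif
+```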
 
 
+
+For integer arithmetic, this is different because the results always have to be exact. Consider the case of division by 2:
 
 ```cpp
 unsigned div_unsigned(unsigned x) {
@@ -103,7 +105,7 @@ A widely known optimization is to replace it with a single right shift (`x >> 1`
 shr eax
 ```
 
-This is certainly correct for all *positive* numbers. But what about the general case?
+This is certainly correct for all *positive* numbers, but what about the general case?
 
 ```cpp
 int div_signed(int x) {
@@ -114,7 +116,9 @@ int div_signed(int x) {
 If `x` is negative, then simply shifting doesn't work — regardless of whether shifting is done in zeros or sign bits:
 
 - If we shift in zeros, we get a non-negative result (the sign bit is zero).
-- If we shift in sign bits, then rounding will happen towards negative infinity instead of zero (`-5 / 2` will be equal to `-3` instead of `-2`).
+- If we shift in sign bits, then rounding will happen towards negative infinity instead of zero (`-5 / 2` will be equal to `-3` instead of `-2`)[^python].
+
+[^python]: Fun fact: in Python, integer-dividing a negative number for some reason floors the result, so that `-5 // 2 = -3`, which is equivalent to `-5 >> 1 = -3`. I doubt that Guido van Rossum had this optimization in mind when initially designing the language, but, theoretically, a [JIT-compiled](/hpc/complexity/languages/#compiled-languages) Python program with many divisions by two may be faster than an analogous C++ program.
 
 So, for the general case, we have to insert some crutches to make it work:
 
@@ -125,7 +129,7 @@ add  eax, ebx   ; add 1 to the value if it is negative to ensure rounding toward
 sar  eax        ; this one shifts in sign bits
 ```
 
-But the positive case is clearly what was intended. Here we can also use the `assume` mechanism to exclude that corner case:
+When only the positive case is what was intended, we can also use the `assume` mechanism to eliminate the possibility of negative `x` and avoid handling this corner case:
 
 ```cpp
 int div_assume(int x) {
@@ -134,6 +138,8 @@ int div_assume(int x) {
 }
 ```
 
+Although in this particular case, perhaps the best syntax to express that we only expect non-negative numbers is to use an unsigned integer type.
+
 Because of nuances like this, it is often beneficial to expand the algebra in intermediate functions and manually simplify arithmetic yourself rather than relying on the compiler to do it.
 
 ### Memory Aliasing
@@ -151,9 +157,9 @@ void add(int *a, int *b, int n) {
 
 Since each iteration of this loop is independent, it can be executed in parallel and [vectorized](/hpc/simd). But is it, technically?
 
-There may be a problem if the arrays `a` and `b` intersect. Consider the case when `b == a + 1`, that is, if `b` is a just a memory view of `a` starting from the second element. In this case, the next iteration depends on the previous one, and the only correct solution is execute the loop sequentially. The compiler has to check for such possibilities, even if the programmer knows they can't happen.
+There may be a problem if the arrays `a` and `b` intersect. Consider the case when `b == a + 1`, that is, if `b` is just a memory view of `a` starting from its second element. In this case, the next iteration depends on the previous one, and the only correct solution is to execute the loop sequentially. The compiler has to check for such possibilities even if the programmer knows they can't happen.
 
-This is why we have `const` and `restrict` keywords. The first one enforces that that we won't modify memory with the pointer variable, and the second is a way to tell compiler that the memory is guaranteed to be not aliased.
+This is why we have `const` and `restrict` keywords. The first one enforces that we won't modify memory with the pointer variable, and the second is a way to tell the compiler that the memory is guaranteed to not be aliased.
 
 ```cpp
 void add(int * __restrict__ a, const int * __restrict__ b, int n) {
@@ -164,11 +170,11 @@ void add(int * __restrict__ a, const int * __restrict__ b, int n) {
 
 These keywords are also a good idea to use by themselves for the purpose of self-documenting.
 
-### C++20 Contracts
+### C++ Contracts
 
-Contract programming is an underused, but very powerful technique.
+Contract programming is an underused but very powerful technique.
 
-Design-by-contract actually made it into the C++20 standard in the form of [contract sattributes](http://www.hellenico.gr/cpp/w/cpp/language/attributes/contract.html), which are functionally equivalent to our hand-made, compiler-specific `assume`:
+There is a late-stage proposal to add design-by-contract into the C++ standard in the form of [contract attributes](http://www.hellenico.gr/cpp/w/cpp/language/attributes/contract.html), which are functionally equivalent to our hand-made, compiler-specific `assume`:
 
 ```c++
 T at(size_t k) [[ expects: k < n ]] {
@@ -178,7 +184,7 @@ T at(size_t k) [[ expects: k < n ]] {
 
 There are 3 types of attributes — `expects`, `ensures`, and `assert` — respectively used for specifying pre- and post-conditions in functions and general assertions that can be put anywhere in the program.
 
-Unfortunately, this exciting new feature is not yet implemented in any major C++ compiler, but maybe around 2022-2023 we will be able to write code like this:
+Unfortunately, this exciting new feature is [not yet finally standardized](https://www.reddit.com/r/cpp/comments/cmk7ek/what_happened_to_c20_contracts/), let alone implemented in a major C++ compiler. But maybe, in a few years, we will be able to write code like this:
 
 ```c++
 bool is_power_of_two(int m) {
@@ -190,7 +196,7 @@ int mod_power_of_two(int x, int m)
     [[ expects: is_power_of_two(m) ]]
     [[ ensures r: r >= 0 && r < m ]]
 {
-    float r = x & (m - 1);
+    int r = x & (m - 1);
     [[ assert: r == x % m ]];
     return r;
 }
@@ -198,4 +204,4 @@ int mod_power_of_two(int x, int m)
 
 Some forms of contract programming are also available in other performance-oriented languages such as [Rust](https://docs.rs/contracts/latest/contracts/) and [D](https://dlang.org/spec/contracts.html).
 
-A general and language-agnostic advice is to always inspect the assembly that the compiler produced, and if it is not what you were hoping for, try to think about corner cases that may be limiting the compiler from optimizing it.
+A general, language-agnostic piece of advice is to always [inspect the assembly](../stages) that the compiler produced, and if it is not what you were hoping for, try to think about corner cases that may be limiting the compiler from optimizing it.
diff --git a/content/english/hpc/compilation/flags.md b/content/english/hpc/compilation/flags.md
index 08e83341..74383237 100644
--- a/content/english/hpc/compilation/flags.md
+++ b/content/english/hpc/compilation/flags.md
@@ -1,6 +1,7 @@
 ---
 title: Flags and Targets
 weight: 2
+published: true
 ---
 
 The first step of getting high performance from the compiler is to ask for it, which is done with over a hundred different compiler options, attributes, and pragmas.
@@ -11,9 +12,9 @@ There are 4 *and a half* main levels of optimization for speed in GCC:
 
 - `-O0` is the default one that does no optimizations (although, in a sense, it does optimize: for compilation time).
 - `-O1` (also aliased as `-O`) does a few "low-hanging fruit" optimizations, almost not affecting the compilation time.
-- `-O2` enables all optimizations that are known to have little to no negative side effects and take reasonable time to complete (this is what most projects use for production builds).
+- `-O2` enables all optimizations that are known to have little to no negative side effects and take a reasonable time to complete (this is what most projects use for production builds).
 - `-O3` does very aggressive optimization, enabling almost all *correct* optimizations implemented in GCC.
-- `-Ofast` does everything in `-O3`, plus a few more optimizations flags that may break strict standard compliance, but not in a way that would be critical for most applications (e. g. floating-point operations may be rearranged so that the result is off by a few bits of the mantissa).
+- `-Ofast` does everything in `-O3`, plus a few more optimizations flags that may break strict standard compliance, but not in a way that would be critical for most applications (e.g., floating-point operations may be rearranged so that the result is off by a few bits in the mantissa).
 
 There are also many other optimization flags that are not included even in `-Ofast`, because they are very situational, and enabling them by default is more likely to hurt performance rather than improve it — we will talk about some of them in [the next section](../situational).
 
@@ -21,7 +22,7 @@ There are also many other optimization flags that are not included even in `-Ofa
 
 The next thing you may want to do is to tell the compiler more about the computer(s) this code is supposed to be run on: the smaller the set of platforms is, the better. By default, it will generate binaries that can run on any relatively new (>2000) x86 CPU. The simplest way to narrow it down is to pass `-march` flag to specify the exact microarchitecture: `-march=haswell`. If you are compiling on the same computer that will run the binary, you can use `-march=native` for auto-detection.
 
-The instruction sets are generally backward-compatible, so it is often enough to just use the name of the oldest microarchitecture you need to support. A more robust approach is to list specific features that the CPU is guaranteed to have: `-mavx2`, `-mpopcount`. When you just want to *tune* the program for a particular machine without using any instructions that may crash it on incompatible CPUs, you can use the `-mtune` flag (by default `-march=x` also implies `-mtune=x`).
+The instruction sets are generally backward-compatible, so it is often enough to just use the name of the oldest microarchitecture you need to support. A more robust approach is to list specific features that the CPU is guaranteed to have: `-mavx2`, `-mpopcnt`. When you just want to *tune* the program for a particular machine without using any instructions that may crash it on incompatible CPUs, you can use the `-mtune` flag (by default `-march=x` also implies `-mtune=x`).
 
 These options can also be specified for a compilation unit with pragmas instead of compilation flags:
 
@@ -34,7 +35,7 @@ This is useful when you need to optimize a single high-performance procedure wit
 
 ### Multiversioned Functions
 
-Sometimes you may also want to provide several architecture-specific implementations in a single library. You can use attribute-based syntax to select between multiversioned functions automatically during compile-time:
+Sometimes you may also want to provide several architecture-specific implementations in a single library. You can use attribute-based syntax to select between multiversioned functions automatically during compile time:
 
 ```c++
 __attribute__(( target("default") )) // fallback implementation
diff --git a/content/english/hpc/compilation/limitations.md b/content/english/hpc/compilation/limitations.md
index 0c76946e..521f78e7 100644
--- a/content/english/hpc/compilation/limitations.md
+++ b/content/english/hpc/compilation/limitations.md
@@ -21,7 +21,7 @@ In general, when an optimization doesn't happen, it is usually because one of th
 
 - The compiler doesn't have enough information to know it will be beneficial.
 - The optimization is actually not always correct: there is an input on which the result doesn't comply with the spec, even if it is correct on every input that the programmer expects.
-- It isn't implemented in the compiler yet, either because it is too hard to implement in general, too costly to compute or too rare to be worth the trouble (e. g. writing a tiny library for some specific algorithm is usually better than hardcoding it into compiler).
+- It isn't implemented in the compiler yet, either because it is too hard to implement in general, too costly to compute, or too rare to be worth the trouble (e.g., writing a tiny library for some specific algorithm is usually better than hardcoding it into the compiler).
 
 In addition, optimization sometimes fails just due to the source code being overly complicated.
 
@@ -34,4 +34,4 @@ Usually the right approach to performance is to think how the main hot spots of
 2. Is there a real-world dataset for which the optimization may not be beneficial? (hints, pragmas, PGO)
 3. Are there at least 1000 other places where this optimization makes sense? (remove abstractions and implement it manually, add a feature request for GCC and Clang)
 
-In the majority of the cases, at least one of these answers will be "no", and then you will know what to do.
+In the majority of the cases, at least one of these answers will be "no," and then you will know what to do.
diff --git a/content/english/hpc/compilation/precalc.md b/content/english/hpc/compilation/precalc.md
index bd496d30..7de4c8fb 100644
--- a/content/english/hpc/compilation/precalc.md
+++ b/content/english/hpc/compilation/precalc.md
@@ -1,55 +1,73 @@
 ---
-title: Compile-Time Computation
+title: Precomputation
 weight: 8
-draft: true
 ---
 
-### Precalculation
+When compilers can infer that a certain variable does not depend on any user-provided data, they can compute its value during compile time and turn it into a constant by embedding it into the generated machine code.
 
-A compiler can compute constants on its own, but it doesn't *have to*.
+This optimization helps performance a lot, but it is not a part of the C++ standard, so compilers don't *have to* do that. When a compile-time computation is either hard to implement or time-intensive, a compiler may pass on that opportunity.
 
-```c++
-const int b = 4, B = (1 << b);
+### Constant Expressions
 
-// is it tight enough?
-constexpr int round(int k) {
-    return k / B * B; // (k & ~(B - 1));
-}
+For a more reliable solution, in modern C++ you can mark a function as `constexpr`; if it is called with constant arguments in a context that requires a constant expression (such as the `static_assert` below), its value is guaranteed to be computed during compile time:
 
-constexpr int height(int m) {
-    return (m == 0 ? 0 : height(m / B) + 1);
+```c++
+constexpr int fibonacci(int n) {
+    if (n <= 2)
+        return 1;
+    return fibonacci(n - 1) + fibonacci(n - 2);
 }
 
-constexpr int offset(int h) {
-    int res = 0;
-    int m = N;
-    while (h--) {
-        res += round(m) + B;
-        m /= B;
+static_assert(fibonacci(10) == 55);
+```
+
+These functions have some restrictions, such as only being able to call other `constexpr` functions and not being able to allocate memory, but otherwise, they are executed "as is."
+
+Note that while `constexpr` functions don't cost anything during run time, they still increase compilation time, so care at least a little about their efficiency, and don't put anything NP-complete in them:
+
+```c++
+constexpr int fibonacci(int n) {
+    int a = 1, b = 1;
+    while (n--) {
+        int c = a + b;
+        a = b;
+        b = c;
     }
-    return res;
+    return b;
 }
+```
 
-constexpr int h = height(N);
-alignas(64) int t[offset(h)];
-//int t[N * B / (B - 1)]; // +1?
+There used to be many more limitations in earlier C++ standards: for example, you could not use any sort of state inside them and had to rely on recursion, so the whole process felt more like Haskell programming than C++. Since C++17, you can even compute static arrays using the imperative style, which is useful for precomputing lookup tables:
 
-struct Meta {
-    alignas(64) int mask[B][B];
+```c++
+struct Precalc {
+    int isqrt[1000];
 
-    constexpr Meta() : mask{} {
-        for (int k = 0; k < B; k++)
-            for (int i = 0; i < B; i++)
-                mask[k][i] = (i > k ? -1 : 0);
+    constexpr Precalc() : isqrt{} {
+        for (int i = 0; i < 1000; i++)
+            isqrt[i] = int(sqrt(i));
     }
 };
 
-constexpr Meta T;
+constexpr Precalc P;
+
+static_assert(P.isqrt[42] == 6);
 ```
 
+Note that when you call `constexpr` functions while passing non-constants, the compiler may or may not compute them during compile time:
+
+```c++
+for (int i = 0; i < 100; i++)
+    cout << fibonacci(i) << endl;
+```
+
+In this example, even though we perform a constant number of iterations and call `fibonacci` with parameters known at compile time, they are technically not compile-time constants. It's up to the compiler whether to optimize this loop or not — and for heavy computations, it often chooses not to.
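+
+If you do want a guarantee, one option (a sketch in the spirit of `Precalc` above; all names are ours) is to precompute the values into a `constexpr` table and only read from it at run time:
+
+```c++
+struct Fib {
+    int f[46]; // fibonacci(46) is the last one that fits into an int
+
+    constexpr Fib() : f{} {
+        f[0] = f[1] = 1;
+        for (int i = 2; i < 46; i++)
+            f[i] = f[i - 1] + f[i - 2];
+    }
+};
+
+constexpr Fib F;
+
+for (int i = 0; i < 46; i++)
+    cout << F.f[i] << endl;
+```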
+
+
diff --git a/content/english/hpc/compilation/situational.md b/content/english/hpc/compilation/situational.md
index ee758f06..41620c70 100644
--- a/content/english/hpc/compilation/situational.md
+++ b/content/english/hpc/compilation/situational.md
@@ -63,7 +63,7 @@ This is a new feature that only appeared in C++20. Before that, there were compi
 
 ```c++
 int factorial(int n) {
-    if (likely(n > 1))
+    if (__builtin_expect(n > 1, 1))
         return n * factorial(n - 1);
     else
         return 1;
@@ -96,13 +96,13 @@ The whole process is automated by modern compilers. For example, the `-fprofile-
 g++ -fprofile-generate [other flags] source.cc -o binary
 ```
 
-After we run the program — preferably on input that is as representative of real use case as possible — it will create a bunch of `*.gcda` files that contain log data for the test run, after which we can rebuild the program, but now adding the `-fprofile-use` flag:
+After we run the program — preferably on input that is as representative of the real use case as possible — it will create a bunch of `*.gcda` files that contain log data for the test run, after which we can rebuild the program, but now adding the `-fprofile-use` flag:
 
 ```
 g++ -fprofile-use [other flags] source.cc -o binary
 ```
 
-It usually improves performance by 10-20% for large codebases, and for this reason it is commonly included in the build process of performance-critical projects. One more reason to invest in solid benchmarking code.
+It usually improves performance by 10-20% for large codebases, and for this reason it is commonly included in the build process of performance-critical projects. This is more reason to invest in solid benchmarking code.
 
 
 
@@ -82,7 +82,7 @@ Dennard scaling has ended, but Moore's law is not dead yet.
 
 Clock rates plateaued, but the transistor count is still increasing, allowing for the creation of new, *parallel* hardware. Instead of chasing faster cycles, CPU designs started to focus on getting more useful things done in a single cycle. Instead of getting smaller, transistors have been changing shape.
 
-This resulted in increasingly complex architectures capable of doing dozens, hundreds, or even  thousands of different things every cycle.
+This resulted in increasingly complex architectures capable of doing dozens, hundreds, or even thousands of different things every cycle.
 
 ![Die shot of a Zen CPU core by AMD (~1,400,000,000 transistors)](../img/die-shot.jpg)
 
diff --git a/content/english/hpc/complexity/languages.md b/content/english/hpc/complexity/languages.md
index f719f08e..abb80979 100644
--- a/content/english/hpc/complexity/languages.md
+++ b/content/english/hpc/complexity/languages.md
@@ -1,7 +1,9 @@
 ---
 title: Programming Languages
-aliases: [/hpc/analyzing-performance]
+aliases:
+  - /hpc/analyzing-performance
 weight: 2
+published: true
 ---
 
 If you are reading this book, then somewhere on your computer science journey you had a moment when you first started to care about the efficiency of your code.
@@ -10,15 +12,15 @@ Mine was in high school, when I realized that making websites and doing *useful*
 
 I didn't know much about computer architecture to answer this question. But I also didn't need the right answer — I needed a rule of thumb. My thought process was: "2-3GHz means 2 to 3 billion instructions executed every second, and in a simple loop that does something with array elements, I also need to increment loop counter, check end-of-loop condition, do array indexing and stuff like that, so let's add room for 3-5 more instructions for every useful one" and ended up with using $5 \cdot 10^8$ as an estimate. None of these statements are true, but counting how many operations my algorithm needed and dividing it by this number was a good rule of thumb for my use case.
 
-The real answer, of course, is much more complicated and highly dependent on what kind of "operation" you have in mind. It can be as low as $10^7$ for things like [pointer chasing](/hpc/memory/latency) and as high as $10^{11}$ for [SIMD-accelerated](/hpc/simd) linear algebra. To demonstrate these striking differences, we will use the case study of matrix multiplication implemented in different languages — and dig deeper into how computers execute them.
+The real answer, of course, is much more complicated and highly dependent on what kind of "operation" you have in mind. It can be as low as $10^7$ for things like [pointer chasing](/hpc/cpu-cache/latency) and as high as $10^{11}$ for [SIMD-accelerated](/hpc/simd) linear algebra. To demonstrate these striking differences, we will use the case study of matrix multiplication implemented in different languages — and dig deeper into how computers execute them.
 
 
 
@@ -32,12 +34,12 @@ These instructions — called *machine code* — are binary encoded, quirky and
 
 -->
 
-On the lowest level, computers execute *machine code* consisting of binary-encoded *instructions* which are used to control the CPU. They are specific, quirky, and require a great deal of intellectual effort to work with, so one of the first things people did after creating computers was creating *programming languages*, which abstract away some details of how computers operate to simplify the process of programming.
+On the lowest level, computers execute *machine code* consisting of binary-encoded *instructions* which are used to control the CPU. They are specific, quirky, and require a great deal of intellectual effort to work with, so one of the first things people did after creating computers was create *programming languages*, which abstract away some details of how computers operate to simplify the process of programming.
 
 A programming language is fundamentally just an interface. Any program written in it is just a nicer higher-level representation which still at some point needs to be transformed into the machine code to be executed on the CPU — and there are several different means of doing that:
 
 - From a programmer's perspective, there are two types of languages: *compiled*, which pre-process before executing, and *interpreted*, which are executed during runtime using a separate program called *an interpreter*.
-- From a computer's perspective, there are also two types of languages: *native*, which directly execute machine code, and *managed*, which rely on some sort of *a runtime* to do it.
+- From a computer's perspective, there are also two types of languages: *native*, which directly execute machine code, and *managed*, which rely on some sort of *runtime* to do it.
 
 Since running machine code in an interpreter doesn't make sense, this makes a total of three types of languages:
 
@@ -45,7 +47,7 @@ Since running machine code in an interpreter doesn't make sense, this makes a to
 - Compiled languages with a runtime, such as Java, C#, or Erlang (and languages that work on their VMs, such as Scala, F#, or Elixir).
 - Compiled native languages, such as C, Go, or Rust.
 
-There is no "right" way of executing computer programs: each approach has its own gains and drawbacks. Interpreters and virtual machines provide flexibility and enable some nice high-level programming features such as dynamic typing, run-time code alteration, and automatic memory management, but this comes with some unavoidable performance trade-offs, which we will now talk about.
+There is no "right" way of executing computer programs: each approach has its own gains and drawbacks. Interpreters and virtual machines provide flexibility and enable some nice high-level programming features such as dynamic typing, run time code alteration, and automatic memory management, but these come with some unavoidable performance trade-offs, which we will now talk about.
 
 ### Interpreted languages
 
@@ -82,7 +84,7 @@ print(duration)
 
 This code runs in 630 seconds. That's more than 10 minutes!
 
-Let's try to put this number in perspective. The CPU that ran it has a clock frequency of 1.4GHz, meaning that it does $1.4 \cdot 10^9$ cycles per second, totaling to almost $10^{15}$ for the entire computation, and about 880 cycles per each multiplication in the innermost loop.
+Let's try to put this number in perspective. The CPU that ran it has a clock frequency of 1.4GHz, meaning that it does $1.4 \cdot 10^9$ cycles per second, totaling to almost $10^{12}$ for the entire computation, and about 880 cycles per multiplication in the innermost loop.
 
 This is not surprising if you consider the things that Python needs to do to figure out what the programmer meant:
 
@@ -92,7 +94,7 @@ This is not surprising if you consider the things that Python needs to do to fig
 - looks up its type, figures out that it's a `float`, and fetches the method implementing `*` operator;
 - does the same things for `b` and `c` and finally add-assigns the result to `c[i][j]`.
 
-Granted, the interpreters of widely-used languages such as Python are well-optimized, and they can skip through some of these steps on repeated execution of the same code. Buy still, some quite significant overhead is unavoidable due to the language design. If we get rid of all this type checking and pointer chasing, perhaps we can get cycles per multiplication ratio closer to 1, or whatever the "cost" of native multiplication is?
+Granted, the interpreters of widely used languages such as Python are well-optimized, and they can skip through some of these steps on repeated execution of the same code. But still, some quite significant overhead is unavoidable due to the language design. If we get rid of all this type checking and pointer chasing, perhaps we can get the cycles-per-multiplication ratio closer to 1, or whatever the "cost" of native multiplication is?
 
 ### Managed Languages
 
@@ -173,9 +175,9 @@ int main() {
 
 It takes 9 seconds when you compile it with `gcc -O3`.
 
-It doesn't seem like a huge improvement — the 1-3 second advantage over Java and PyPy can be attributed to the additional time of JIT-compilation — but we haven't yet taken advantage of a far better C compiler ecosystem. If we add `-march=native` and `-ffast=math` flags, time suddenly goes down to 0.6 seconds!
+It doesn't seem like a huge improvement — the 1-3 second advantage over Java and PyPy can be attributed to the additional time spent on JIT compilation — but we haven't yet taken advantage of a far better C compiler ecosystem. If we add the `-march=native` and `-ffast-math` flags, the time suddenly goes down to 0.6 seconds!
 
-What happened here is we [communicated to the compiler](/hpc/compilation/flags/) the exact model of the CPU we are running (`-march=native`) and gave it the freedom to rearrange [floating-point computations](/hpc/arithmetic/float) (`-ffast=math`), and so it took advantage of it and used [vectorization](/hpc/simd) to achieve this speedup.
+What happened here is we [communicated to the compiler](/hpc/compilation/flags/) the exact model of the CPU we are running (`-march=native`) and gave it the freedom to rearrange [floating-point computations](/hpc/arithmetic/float) (`-ffast-math`), and so it took advantage of it and used [vectorization](/hpc/simd) to achieve this speedup.
 
 It's not like it is impossible to tune the JIT-compilers of PyPy and Java to achieve the same performance without significant changes to the source code, but it is certainly easier for languages that compile directly to native code.
 
@@ -202,7 +204,7 @@ print(duration)
 
 Now it takes ~0.12 seconds: a ~5x speedup over the auto-vectorized C version and ~5250x speedup over our initial Python implementation!
 
-You don't typically see such dramatic improvements. For now, we are not ready to tell you exactly how this is achieved. Implementations of dense matrix multiplication in OpenBLAS are typically [5000 lines of handwritten assembly](https://github.com/xianyi/OpenBLAS/blob/develop/kernel/x86_64/dgemm_kernel_16x2_haswell.S) tailored separately for *each* architecture. In later chapters, we will explain all the relevant techniques one by one, and then return to this example and develop our own BLAS-level implementation using just under 40 lines of C.
+You don't typically see such dramatic improvements. For now, we are not ready to tell you exactly how this is achieved. Implementations of dense matrix multiplication in OpenBLAS are typically [5000 lines of handwritten assembly](https://github.com/xianyi/OpenBLAS/blob/develop/kernel/x86_64/dgemm_kernel_16x2_haswell.S) tailored separately for *each* architecture. In later chapters, we will explain all the relevant techniques one by one, and then [return](/hpc/algorithms/matmul) to this example and develop our own BLAS-level implementation using just under 40 lines of C.
 
 ### Takeaway
 
diff --git a/content/english/hpc/complexity/levels.md b/content/english/hpc/complexity/levels.md
index 84838709..9a792917 100644
--- a/content/english/hpc/complexity/levels.md
+++ b/content/english/hpc/complexity/levels.md
@@ -1,5 +1,5 @@
 ---
-title: Levels of Optimization
+title: When to Optimize
 weight: 4
 draft: true
 ---
@@ -26,16 +26,59 @@ In any case, the Big-O notation is not what companies really want. It is not abo
 
 You get especially frustrated if you have competitive programming experience. You won't get to solve these types of problems, even if they were asked in an interview; solving them requires a different kind of qualification. The asymptotically optimal algorithm already exists, and what's left is optimizing the constant factor. Unfortunately, only a handful of universities teach that.
 
-## The Hierarchy of Optimization
+## The Levels of Optimization
 
 Programmers can be put in several "levels" in terms of their software optimization abilities:
 
-1. "Newbie". Those who don't think about performance at all. They usually write in high-level languages, sometimes in declarative / functional languages. Most "programmers" stay there (and there is nothing wrong with it).
-2. "Undergraduate student". Those who know about Big O notation and are familiar with basic data structures and approaches. LeetCode and CodeForces folks are there. This is also the requirement in getting into big companies — they have a lot of in-house software, large scale, and they are looking for people in the long term, so asking things like programming language.
-3. "Graduate student". Those who know that not all operations are created equal; know other cost models such as external memory model (B-tree, external sorting), word model (bitset,) or parallel computing, but still in theory.
-4. "Professional developer". Those who know actual timings of these operations. Aware that branch mispredictions are costly, memory is split into cache lines. Knows some basic SIMD techniques. 
-5. "Performance engineer". Know exactly what happens inside their hardware. Know the difference between latency and bandwidth, know about ports. Knows how to use SIMD and the rest of instruction set effectively. Can read assembly and use profilers.
+0. *Newbie*. Those who don't think about performance at all. They usually write in high-level languages, sometimes in declarative / functional languages. Most "programmers" stay there (and there is nothing wrong with it).
+1. *Undergraduate student*. Those who know about Big O notation and are familiar with basic data structures and approaches. LeetCode and CodeForces folks are there. This is also roughly what is required to get into big companies: they have a lot of large-scale in-house software and hire people for the long term, so they tend to ask about general algorithmic skills rather than, say, a particular programming language.
+2. *Graduate student*. Those who know that not all operations are created equal and are familiar with other cost models, such as the external memory model (B-trees, external sorting), the word model (bitsets), or parallel computing, but only in theory.
+3. *Professional developer*. Those who know the actual timings of these operations, are aware that branch mispredictions are costly and that memory is split into cache lines, and know some basic SIMD techniques.
+4. *Performance engineer*. Those who know exactly what happens inside their hardware: the difference between latency and bandwidth, the execution ports, and so on. They know how to use SIMD and the rest of the instruction set effectively, can read assembly, and use profilers.
+5. *Intel employee*. Those who know microarchitecture-specific details. This is outside the purview of normal engineers.
 
-In this book, we expect that the average reader is somewhere around stage 2, and hopefully by the end of it will get to 5.
+In this book, we expect that the average reader is somewhere around stage 1, and hopefully, by the end of it, will get to stage 4.
 
 You should also go through these levels when designing algorithms. First, get it working in the first place; then select a few reasonably asymptotically optimal algorithms. Then think about how they are going to work in terms of their memory operations and their ability to execute in parallel (even if you only consider single-threaded programs, there is still plenty of parallelism inside a core, so this way of thinking still applies), and only then proceed to the actual implementation. Avoid premature optimization, as Knuth once said.
+
+---
+
+For most web services, efficiency doesn't matter, but *latency* does.
+
+Squeezing out more efficiency is usually not how such systems are improved nowadays.
+
+A pageview usually generates somewhere on the order of 0.1 to 1 cent of revenue. This is a typical rate at which you can monetize user attention. Say, if I simply installed AdSense, I'd be getting something like that, depending on where most of my readers are from and how many of them are using an ad blocker.
+
+At the same time, a server with a dedicated core and 1GB of RAM (which is an absurdly large amount of resources for a simple web service) costs around one millionth per second when amortized. You could fetch 100 photos with that.
+
+Amazon once ran an experiment where they A/B tested their service with artificial delays and found that a 100ms delay decreased revenue. The same goes for most other services: if a user loses their "flow" on Twitter, they are likely to start thinking about something else and leave, and if Google takes more than a few seconds to respond, people will just assume it isn't working and quit.
+
+Latency can usually be reduced with parallel computing, which is why distributed systems focus more on scalability. This part of the book is concerned with improving the *efficiency* of algorithms, which lowers latency as a by-product.
+
+However, there are still use cases where there is a trade-off between the quality of the results and the cost of the servers.
+
+- Search is hierarchical. There are usually many layers of more accurate but slower models. The more documents you rank on each layer, the better the final quality.
+- Games. They are more enjoyable at a larger scale, but the required computational power also increases. This includes game AI.
+- AI workloads — those that involve large quantities of data, such as language models. Heavier models require more compute, and the bottleneck is often not the amount of data but the efficiency of the computation.
+
+There are also inherently sequential algorithms and cases where the resources are simply constrained: Ctrl+F'ing a large PDF is painful, and so is factoring large numbers.
+
+## Estimating the Impact
+
+Sometimes the optimization needs to happen in the calling layer.
+
+The simdjson library speeds up JSON parsing, but it may be better not to use JSON in the first place.
+
+Protobuf or flat binary formats may be a better choice.
+
+There is also a chicken-and-egg problem: people don't use an approach much because, in its current form, it is too slow to be feasible.
+
+Optimization also has costs: implementation effort, bugs, and reduced maintainability. It is perfectly fine that most software in the world is inefficient.
+
+What does it mean to be a better programmer? Writing faster programs? Working faster? Producing fewer bugs? It is a combination of those.
+
+Compiler optimizations and databases are examples of high-leverage targets because their performance acts as a tax on everything else — which is why you see most people writing books on these particular topics rather than on software optimization in general.
+
+---
+
+Factorization is kind of useless by itself, but it helps with understanding how to optimize number-theoretic computations in general. The same goes for sorting and binary trees: the value is mostly in the meta-knowledge you pick up along the way.
diff --git a/content/english/hpc/cpu-cache/_index.md b/content/english/hpc/cpu-cache/_index.md
index 9bde5517..ef1bbd6f 100644
--- a/content/english/hpc/cpu-cache/_index.md
+++ b/content/english/hpc/cpu-cache/_index.md
@@ -3,45 +3,50 @@ title: RAM & CPU Caches
 weight: 9
 ---
 
-In the previous chapter, we studied computer memory from theoretical standpoint, using the [external memory model](../external-memory) to estimate performance of memory-bound algorithms.
+In the [previous chapter](../external-memory), we studied computer memory from a theoretical standpoint, using the [external memory model](../external-memory/model) to estimate the performance of memory-bound algorithms.
 
-While it is more or less accurate for computations involving HDDs and network storage, where in-memory arithmetic is negligibly fast compared to external I/O operations, it becomes erroneous on lower levels in the cache hierarchy, where the costs of these operations become comparable.
+While the external memory model is more or less accurate for computations involving HDDs and network storage, where the cost of arithmetic operations on in-memory values is negligible compared to external I/O operations, it is too imprecise for lower levels in the cache hierarchy, where the costs of these operations become comparable.
+
+To perform more fine-grained optimization of in-memory algorithms, we have to start taking into account the many specific details of the CPU cache system. And instead of studying loads of boring Intel documents with dry specs and theoretically achievable limits, we will estimate these parameters experimentally by running numerous small benchmark programs with access patterns that resemble the ones that often occur in practical code.
+
+
+
 
-### Recall: CPU Caches
+### Experimental Setup
 
-If you jumped to this page straight from Google or just forgot what [we've been doing](../), here is a brief summary of how memory operations work in CPUs:
+As before, I will be running all experiments on Ryzen 7 4700U, which is a "Zen 2" CPU with the following main cache-related specs:
 
-- In-between CPU registers and RAM, there is a hierarchy of *caches* that exist to speed up access to frequently used data: "lower" layers are faster, but more expensive and therefore smaller in size.
-- Caches are physically a part of CPU. Accessing them takes a fixed amount of time in CPU cycles, so their real access time is proportional to the clock rate. On the contrary, RAM is a separate chip with its own clock rate. Its latencies are therefore better measured in nanoseconds, and not cycles.
-- The CPU cache system operates on *cache lines*, which is the basic unit of data transfer between the CPU and the RAM. The size of a cache line is 64 bytes on most architectures, meaning that all main memory is divided into blocks of 64 bytes, and whenever you request (read or write) a single byte, you are also fetching all its 63 cache line neighbors whether your want them or not.
-- Memory requests can overlap in time: while you wait for a read request to complete, you can sand a few others, which will be executed concurrently. In some contexts that allow for many concurrent I/O operations it therefore makes more sense to talk abound memory *bandwidth* than *latency*.
-- Taking advantage of this free concurrency, it is often beneficial to *prefetch* data that you will likely be accessing soon, if you know its location. You can do this explicitly by using a separate instruction or just by accessing any byte in its cache line, but the most frequent patterns, such as linearly iterating forward or backward over an array, prefetching is already handled by hardware.
-- Caching is done transparently; when there isn't enough space to fit a new cache line, the least recently used one automatically gets evicted to the next, slower layer of cache hierarchy. The programmer can't control this process explicitly.
-- Since implementing "find the oldest among million cache lines" in hardware is unfeasible, each cache layer is split in a number of small "sets", each covering a certain subset of memory locations. *Associativity* is the size of these sets, or, in other terms, how many different "cells" of cache each data location can be mapped to. Higher associativity allows more efficient utilization of cache.
-- There are other types of cache inside CPUs that are used for things other than data. The most important for us are *instruction cache* (I-cache), which is used to speed up the fetching of machine code from memory, and *translation lookaside buffer* (TLB), which is used to store physical locations of virtual memory pages, which is instrumental to the efficiency of virtual memory.
+- 8 physical cores (without hyper-threading) clocked at 2GHz (and 4.1GHz in boost mode — [which we disable](/hpc/profiling/noise));
+- 256K of 8-way set associative L1 data cache or 32K per core;
+- 4M of 8-way set associative L2 cache or 512K per core;
+- 8M of 16-way set associative L3 cache, [shared](sharing) between 8 cores;
+- 16GB (2x8G) of DDR4 RAM @ 2667MHz.
 
-The last few points may be a bit hand-wavy, but don't worry: they will become clear as we go along with the experiments and demonstrate it all in action.
+You can compare it with your own hardware by running `dmidecode -t cache` or `lshw -class memory` on Linux or by installing [CPU-Z](https://en.wikipedia.org/wiki/CPU-Z) on Windows. You can also find additional details about the CPU on [WikiChip](https://en.wikichip.org/wiki/amd/ryzen_7/4700u) and [7-CPU](https://www.7-cpu.com/cpu/Zen2.html). Not all conclusions will generalize to every CPU platform in existence.
+
+
 
-- 8 physical cores (without hyper-threading) clocked at 2GHz[^boost];
-- 512K of 8-way set associative L1 cache, half of which is instruction cache — meaning 32K per core;
-- 4M of 8-way set associative L2 cache, or 512K per core;
-- 8M of 16-way set associative L3 cache, *shared* between 8 cores (4M actually);
-- 16G of DDR4 RAM @ 2667MHz.
+Due to difficulties in [preventing the compiler from optimizing away unused values](/hpc/profiling/noise/), the code snippets in this article are slightly simplified for exposition purposes. Check the [code repository](https://github.com/sslotin/amh-code/tree/main/cpu-cache) if you want to reproduce them yourself.
 
-[^boost]: Although the CPU can be clocked at 4.1GHz in boost mode, we will perform most experiments at 2GHz to reduce noise — so keep in mind that in realistic applications the numbers can be multiplied by 2.
+### Acknowledgements
 
-You can compare it with your own hardware by running `dmidecode -t cache` or `lshw -class memory` on Linux or just looking it up on WikiChip.
+This chapter is inspired by "[Gallery of Processor Cache Effects](http://igoro.com/archive/gallery-of-processor-cache-effects/)" by Igor Ostrovsky and "[What Every Programmer Should Know About Memory](https://people.freebsd.org/~lstewart/articles/cpumemory.pdf)" by Ulrich Drepper, both of which can serve as good accompanying readings.
 
-Due to difficulties in [refraining compiler from cheating](..//hpc/analyzing-performance/profiling/), the code snippets in this article are be slightly simplified for exposition purposes. Check the [code repository](https://github.com/sslotin/amh-code/tree/main/cpu-cache) if you want to reproduce them yourself.
+
diff --git a/content/english/hpc/cpu-cache/alignment.md b/content/english/hpc/cpu-cache/alignment.md
index 9c7a68c4..e9c5f4d3 100644
--- a/content/english/hpc/cpu-cache/alignment.md
+++ b/content/english/hpc/cpu-cache/alignment.md
@@ -1,13 +1,50 @@
 ---
-title: Data Alignment
-weight: 9
+title: Alignment and Packing
+weight: 8
 ---
 
-The fact that the memory is split into cache lines has huge implications on data structure layout. If you need to retrieve a certain atomic object, such as a 32-bit integer, you want to have it all located in a single cache line: both because hardware stitching results together takes precious transistor space and because retrieving 2 cache lines is slow and increases memory bandwidth. The "natural" alignment of `int` is 4 bytes.
+The fact that the memory is partitioned into 64B [cache lines](../cache-lines) makes it difficult to operate on data words that cross a cache line boundary. When you need to retrieve some primitive type, such as a 32-bit integer, you really want to have it located on a single cache line — both because retrieving two cache lines requires more memory bandwidth and because stitching the results together in hardware takes up precious transistor space.
 
-For this reason, in C and most other programming languages structures by default pad structures with blank bytes in order to insure that every data member will not be split by a cache line boundary. Instead of playing a complex tetris game and rearranging its members, it simply pads each element so that the alignment of the next one matches its "natural" one. In addition, the data structure as a whole may be padded with a final unnamed member to allow each member of an array of structures to be properly aligned.
+This aspect heavily influences algorithm designs and how compilers choose the memory layout of data structures.
 
-Consider the following toy example:
+### Aligned Allocation
+
+By default, when you allocate an array of some primitive type, you are guaranteed that the addresses of all elements are a multiple of their size, which ensures that they only span a single cache line. For example, in an `int` array, the address of the first element, and therefore of every other element, is guaranteed to be a multiple of 4 bytes (`sizeof(int)`).
+
+Sometimes you need to ensure that this minimum alignment is higher. For example, many [SIMD](/hpc/simd) applications read and write data in blocks of 32 bytes, and it is [crucial for performance](/hpc/simd/moving) that these 32 bytes belong to the same cache line. In such cases, you can use the `alignas` specifier when defining a static array variable:
+
+```c++
+alignas(32) float a[n];
+```
+
+To allocate a memory-aligned array dynamically, you can use `std::aligned_alloc`, which takes the alignment value and the size of an array in bytes and returns a pointer to the allocated memory — just like the `new` operator does:
+
+```c++
+void *a = std::aligned_alloc(32, 4 * n);
+```
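+
+One caveat not shown above: unlike memory obtained with `new`, memory returned by `std::aligned_alloc` has to be released with `std::free`. A minimal sketch (variable names are made up):
+
+```c++
+#include <cstdlib>
+
+// allocate a 32-byte-aligned buffer of 4 * n bytes
+// (n is assumed to be such that 4 * n is a multiple of 32; see the note below)
+float *a = static_cast<float*>(std::aligned_alloc(32, 4 * n));
+// ... use a ...
+std::free(a); // not delete[]: the memory did not come from new[]
+```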
+
+You can also align memory to sizes [larger than the cache line](../paging). The only restriction is that the size parameter must be an integral multiple of alignment.
+
+You can also use the `alignas` specifier when defining a `struct`:
+
+```c++
+struct alignas(64) Data {
+    // ...
+};
+```
+
+Whenever an instance of `Data` is allocated, it will be at the beginning of a cache line. The downside is that the effective size of the structure will be rounded up to the nearest multiple of 64 bytes. This has to be done so that, e.g., when allocating an array of `Data`, not just the first element is properly aligned.
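+
+For example, a minimal sketch of what this means in terms of sizes (`Padded` is a made-up name):
+
+```c++
+struct alignas(64) Padded {
+    int x; // 4 bytes of actual data
+};
+
+static_assert(alignof(Padded) == 64, "instances start at a cache line boundary");
+static_assert(sizeof(Padded) == 64, "the size is rounded up to a multiple of the alignment");
+```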
+
+### Structure Alignment
+
+This issue becomes more complicated when we need to allocate a group of non-uniform elements, which is the case for structures. Instead of playing Tetris trying to rearrange the members of a `struct` so that each of them is within a single cache line — which isn't always possible as the structure itself doesn't have to be placed on the start of a cache line — most C/C++ compilers also rely on the mechanism of memory alignment.
+
+Structure alignment similarly ensures that the addresses of all its primitive-type members (`char`, `int`, `float*`, etc.) are multiples of their size, which automatically guarantees that each of them only spans one cache line. It achieves that by:
+
+- *padding*, if necessary, each structure member with a variable number of blank bytes to satisfy the alignment requirement of the next member;
+- setting the alignment requirement of the structure itself to the maximum of the alignment requirements of its member types, so that when an array of the structure type is allocated or it is used as a member type in another structure, the alignment requirements of all its primitive types are satisfied.
+
+For better understanding, consider the following toy example:
 
 ```cpp
 struct Data {
@@ -18,29 +55,134 @@ struct Data {
 };
 ```
 
-When stored succinctly, it needs a total of $1 + 2 + 4 + 1 = 8$ bytes per instance, but doing so raises a few issues. Assuming that the whole structure has alignment of 4 (its largest member, `int`), `a` is fine, but `b`, `c` and `d` are not aligned.
+When stored succinctly, this structure needs a total of $1 + 2 + 4 + 1 = 8$ bytes per instance, but even assuming that the whole structure has the alignment of 4 bytes (its largest member, `int`), only `a` and `d` will be fine (single bytes can't be misaligned), while `b` and `c` are not size-aligned and potentially cross a cache line boundary.
 
-To fix this, compiler inserts unnamed members so that each next unaligned member gets to its alignment:
+To fix this, the compiler inserts some unnamed members so that each next member gets the right minimum alignment:
 
 ```cpp
 struct Data {
     char a;    // 1 byte
-    char x[1]; // 1 byte for the following 'short' to be aligned on a 2 byte boundary
+    char x[1]; // 1 byte for the following "short" to be aligned on a 2-byte boundary
     short b;   // 2 bytes 
-    int c;     // 4 bytes - largest structure member
+    int c;     // 4 bytes (largest member, setting the alignment of the whole structure)
     char d;    // 1 byte
     char y[3]; // 3 bytes to make total size of the structure 12 bytes (divisible by 4)
 };
+
+// sizeof(Data) = 12
+// alignof(Data) = alignof(int) = sizeof(int) = 4
+```
+
+This potentially wastes space but saves a lot of CPU cycles. This trade-off is mostly beneficial, so structure alignment is enabled by default in most compilers.
+
+### Optimizing Member Order
+
+Padding is only inserted before a not-yet-aligned member or at the end of the structure. By changing the ordering of members in a structure, it is possible to change the required number of padding bytes and the total size of the structure.
+
+In the previous example, we could reorder the structure members like this:
+
+```c++
+struct Data {
+    int c;
+    short b;
+    char a;
+    char d;
+};
+```
+
+Now, each of them is aligned without any padding, and the size of the structure is just 8 bytes. It seems stupid that the size of a structure, and consequently its performance, depend on the order in which its members are defined, but this is required for binary compatibility.
+
+As a rule of thumb, order the members of a structure from the largest data types to the smallest — this greedy algorithm is guaranteed to work unless you have some weird non-power-of-two type sizes such as the [10-byte](/hpc/arithmetic/ieee-754#float-formats) `long double`[^extended].
+
+[^extended]: The 80-bit `long double` takes *at least* 10 bytes, but the exact format is up to the compiler — for example, it may pad it to 12 or 16 bytes to minimize alignment issues (64-bit GCC and Clang use 16 bytes by default; you can override this by specifying one of `-mlong-double-64/80/128` or `-m96/128bit-long-double` [options](https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html)).
+
+
+
+### Structure Packing
+
+If you know what you are doing, you can disable structure padding and pack your data as tightly as possible.
+
+You have to ask the compiler to do it, as such functionality is not a part of either the C or the C++ standard yet. In GCC and Clang, this is done with the `packed` attribute:
+
+```cpp
+struct __attribute__ ((packed)) Data {
+    long long a;
+    bool b;
+};
+```
+
+This makes the instances of `Data` take just 9 bytes instead of the 16 required by alignment, at the cost of possibly fetching two cache lines to read its elements.
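+
+A quick way to check this, assuming a typical 64-bit platform where `long long` takes 8 bytes (`PackedData` is a made-up name):
+
+```c++
+struct __attribute__ ((packed)) PackedData {
+    long long a;
+    bool b;
+};
+
+static_assert(sizeof(PackedData) == 9, "8 (long long) + 1 (bool), no padding");
+// without the attribute, the size would be 16: 8 + 1 + 7 bytes of trailing padding
+```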
+
+### Bit Fields
+
+You can also use packing along with *bit fields*, which allow you to explicitly fix the size of a member in bits:
+
+```cpp
+struct __attribute__ ((packed)) Data {
+    char a;     // 1 byte
+    int b : 24; // 3 bytes
+};
 ```
 
-Padding is only inserted when a structure member is followed by a member with a larger alignment requirement or at the end of the structure. By changing the ordering of members in a structure, it is possible to change the amount of padding required to maintain alignment. For example, if members are sorted by descending alignment requirements a minimal amount of padding is required. The minimal amount of padding required is always less than the largest alignment in the structure. Computing the maximum amount of padding required is more complicated, but is always less than the sum of the alignment requirements for all members minus twice the sum of the alignment requirements for the least aligned half of the structure members.
+This structure takes 4 bytes when packed and 8 bytes when padded. The number of bits a member has doesn't have to be a multiple of 8, and neither does the total structure size. In an array of `Data`, the neighboring elements will be "merged" in the case of a non-whole number of bytes. It also allows you to set a width that exceeds the base type, which acts as padding — although the compiler throws a warning in the process.
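+
+As an illustration of the last trick, here is a sketch that GCC and Clang accept in C++ (with the aforementioned warning that the width exceeds the type); the struct name is made up:
+
+```c++
+struct __attribute__ ((packed)) WithPadding {
+    char a;
+    int b : 48; // "int" is only 32 bits wide: the extra 16 bits act as padding
+};
+```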
+
+
+
+This feature is not so widespread because CPUs don't have 3-byte arithmetic or anything like that and have to do some inefficient byte-by-byte conversion during loading:
+
+```cpp
+int load(unsigned char *p) {
+    // assemble the 24-bit value byte by byte, least significant byte first,
+    // to match the masked 4-byte load below (unsigned char prevents sign extension)
+    int x = p[0], y = p[1], z = p[2];
+    return x + (y << 8) + (z << 16);
+}
+```
 
-By default, when you allocate an array, the only guarantee about its alignment you get is that none of its elements are split by a cache line. For an array of `int`, this means that it gets the alignment of 4 bytes (`sizeof int`), which lets you load exactly one cache line when reading any element.
+The overhead is even larger when there is a non-whole byte — it needs to be handled with a shift and an and-mask.
 
-Alignment requirements can be declared not only for the data type, but for a particular variable. The typical use cases are allocating something the beginning of a 64-byte cache line, 32-byte SIMD block or a 4K memory page.
+This procedure can be optimized by loading a 4-byte `int` and then using a mask to discard its highest bits.
 
 ```cpp
-alignas(64) float a[n];
+int load(int *p) {
+    int x = *p;
+    return x & ((1<<24) - 1);
+}
 ```
 
-For allocating an array dynamically, we can use `std::aligned_alloc` which takes the alignment value and the size of array in bytes, and returns a pointer to the allocated memory (just like `new` does), which should be explicitly deleted when no longer used.
+Compilers usually don't do that because it's technically not legal: that 4th byte may be on a memory page that you don't own, so the operating system won't let you load it even if you are going to discard it right away.
diff --git a/content/english/hpc/cpu-cache/aos-soa.md b/content/english/hpc/cpu-cache/aos-soa.md
new file mode 100644
index 00000000..d5765339
--- /dev/null
+++ b/content/english/hpc/cpu-cache/aos-soa.md
@@ -0,0 +1,109 @@
+---
+title: AoS and SoA
+weight: 13
+---
+
+It is often beneficial to group together the data you need to fetch at the same time: preferably, on the same or, if that isn't possible, neighboring cache lines. This improves the [spatial locality](/hpc/external-memory/locality) of your memory accesses, positively impacting the performance of memory-bound algorithms.
+
+To demonstrate the potential effect of doing this, we modify the [pointer chasing](../latency) benchmark so that the next pointer is computed using not one, but a variable number of fields ($D$).
+
+### Experiment
+
+The first approach will place these fields together as the rows of a two-dimensional array. We will refer to this variant as *array of structures* (AoS):
+
+```c++
+const int M = N / D; // # of memory accesses
+int p[M], q[M][D];
+
+iota(p, p + M, 0);
+random_shuffle(p, p + M);
+
+int k = p[M - 1];
+
+for (int i = 0; i < M; i++) {
+    q[k][0] = p[i];
+
+    for (int j = 1; j < D; j++)
+        q[k][0] ^= (q[k][j] = rand());
+
+    // the xor-sum of row k is now p[i], which is also the next row in the chain
+    k = p[i];
+}
+
+for (int i = 0; i < M; i++) {
+    int x = 0;
+    for (int j = 0; j < D; j++)
+        x ^= q[k][j];
+    k = x;
+}
+```
+
+And in the second approach, we will place them separately. The laziest way to do this is to transpose the two-dimensional array `q` and swap the indices in all its subsequent accesses:
+
+```c++
+int q[D][M];
+//    ^--^
+```
+
+By analogy, we call this variant *structure of arrays* (SoA). Obviously, for large $D$'s, it performs much worse:
+
+![](../img/aos-soa.svg)
+
+The running time of both variants grows linearly with $D$, but AoS needs to fetch up to 16 times fewer cache lines in total, as its data is stored sequentially. Even when $D=64$, the additional time it takes to process the other 63 values is less than the latency of the first fetch.
+
+You can also see the spikes at the powers of two. AoS performs slightly better because it can compute [horizontal xor-sum](/hpc/simd/reduction) faster with SIMD. In contrast, SoA performs much worse, but this isn't about $D$, but about $\lfloor N / D \rfloor$, the size of the second dimension, being a large power of two: this causes a pretty complicated [cache associativity](../associativity) effect.
+
+### Temporary Storage Contention
+
+At first, it seems like there shouldn't be any cache issues as $N=2^{23}$ and the array is just too big to fit into the L3 cache in the first place. The nuance is that to process a number of elements from different memory locations in parallel, you still need some space to store them temporarily. You can't simply use registers as there aren't enough of them, so they need to be stored in the cache even though in just a microsecond you won't be needing them.
+
+Therefore, when `N / D` is a large power of two, and we are iterating over the array `q[D][N / D]` along the first index, some of the memory addresses we temporarily need will map to the same cache set — and as there isn't enough space there, many of them will have to be re-fetched from the upper layers of the memory hierarchy.
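+
+To make this concrete, here is a back-of-the-envelope sketch for this benchmark ($D=64$, $M = N / D = 2^{17}$), using the L3 geometry from the [cache associativity](../associativity) section and ignoring for a moment the virtual-to-physical translation discussed below (the function name is made up):
+
+```c++
+#include <cstdint>
+
+// which of the 2^12 L3 sets an address maps to: 6 offset bits, then 12 index bits
+uint64_t l3_set(uint64_t addr) {
+    return (addr >> 6) & ((1 << 12) - 1);
+}
+
+// the D = 64 values read on one iteration are q[0][k], q[1][k], ..., q[63][k];
+// consecutive ones are M * sizeof(int) = 2^17 * 4 = 2^19 bytes apart, and since
+// 2^19 is a multiple of 2^(6 + 12) = 2^18, l3_set() is the same for all of them,
+// so 64 cache lines end up competing for the 16 ways of a single set
+```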
+
+Here is another head-scratcher: if we enable [huge pages](../paging), it expectedly makes the total latency 10-15% lower for most values of $D$, but for $D=64$, it makes things ten times worse:
+
+![Note the logarithmic scale](../img/soa-hugepages.svg)
+
+I doubt that even the engineers who design memory controllers can explain what's happening right off the bat.
+
+In short, the difference is because, unlike the L1/L2 caches that are private to each core, the L3 cache has to use *physical* memory addresses instead of *virtual* ones for synchronization between different cores sharing the cache.
+
+When we are using 4K memory pages, the virtual addresses get somewhat arbitrarily dispersed over the physical memory, which makes the cache associativity problem less severe: the physical addresses will have the same remainder modulo 4K bytes, and not `N / D` as for the virtual addresses. When we specifically require huge pages, this maximum alignment limit increases to 2M, and the cache lines receive much more contention.
+
+This is the only example I know of where enabling huge pages makes performance worse, let alone by a factor of ten.
+
+### Padded AoS
+
+As long as we are fetching the same number of cache lines, it doesn't matter where they are located, right? Let's test it and switch to [padded integers](../cache-lines) in the AoS code:
+
+```c++
+struct padded_int {
+    int val;
+    int padding[15];
+};
+
+const int M = N / D / 16;
+padded_int q[M][D];
+```
+
+Other than that, we are still calculating the xor-sum of $D$ padded integers. We fetch exactly $D$ cache lines, but this time sequentially. The running time shouldn't be different from SoA, but this isn't what happens:
+
+![](../img/aos-soa-padded.svg)
+
+The running time is about ⅓ lower for $D=63$, but this only applies to arrays that exceed the L3 cache. If we fix $D$ and change $N$, you can see that the padded version performs slightly worse on smaller arrays because there are fewer opportunities for random [cache sharing](../cache-lines):
+
+![](../img/aos-soa-padded-n.svg)
+
+As the performance on smaller array sizes is not affected, this clearly has something to do with how RAM works.
+
+### RAM-Specific Timings
+
+From the performance analysis point of view, all data in RAM is physically stored in a two-dimensional array of tiny capacitor cells, which is split into rows and columns. To read or write any cell, you need to perform one, two, or three actions:
+
+1. Read the contents of a row into a *row buffer*, which temporarily discharges the capacitors. 
+2. Read or write a specific cell in this buffer.
+3. Write the contents of a row buffer back into the capacitors so that the data is preserved and the row buffer can be used for other memory accesses.
+
+Here is the punchline: you don't have to perform steps 1 and 3 between two memory accesses that correspond to the same row — you can just use the row buffer as a temporary cache. These three actions take roughly the same time, so this optimization makes long sequences of row-local accesses run thrice as fast compared to dispersed access patterns.
+
+![](../img/ram.png)
+
+The size of the row differs depending on the hardware, but it is usually somewhere between 1024 and 8192 bytes. So even though the padded AoS benchmark places each element in a separate cache line, they are still very likely to be on the same RAM row, and the whole read sequence runs in roughly ⅓ of the time plus the latency of the first memory access.
diff --git a/content/english/hpc/cpu-cache/associativity.md b/content/english/hpc/cpu-cache/associativity.md
index f1cdd77e..b9f278ee 100644
--- a/content/english/hpc/cpu-cache/associativity.md
+++ b/content/english/hpc/cpu-cache/associativity.md
@@ -1,62 +1,107 @@
 ---
 title: Cache Associativity
-weight: 8
+weight: 11
 ---
 
-If you looked carefully, you could notice patterns while inspecting the dots below the graph in the previous experiment. These are not just noise: certain step sizes indeed perform much worse than their neighbors.
-
-For example, the stride of 256 corresponding to this loop:
+Consider a [strided incrementing loop](../cache-lines) over an array of size $N=2^{21}$ with a fixed step size of 256:
 
 ```cpp
 for (int i = 0; i < N; i += 256)
     a[i]++;
 ```
 
-and this one
+And then this one, with the step size of 257:
 
 ```cpp
 for (int i = 0; i < N; i += 257)
     a[i]++;
 ```
 
-differ by more than 10x: 256 runs at 0.067 while 257 runs at 0.751.
+Which one will finish faster? There are several considerations that come to mind:
+
+- At first, you think that there shouldn't be much difference, or maybe that the second loop is $\frac{257}{256}$ times faster or so because it does fewer iterations in total.
+- Then you recall that 256 is a nice round number, which may have something to do with [SIMD](/hpc/simd) or the memory system, so maybe the first one is faster.
+
+But the right answer is very counterintuitive: the second loop is faster — and by a factor of 10.
+
+This isn't just a single bad step size. The performance degrades for all step sizes that are multiples of large powers of two:
+
+![The array size is normalized so that the total number of iterations is constant](../img/strides-small.svg)
+
+There is no vectorization or anything, and the two loops produce the same assembly except for the step size. This effect is due only to the memory system, in particular to a feature called *cache associativity*, which is a peculiar artifact of how CPU caches are implemented in hardware.
+
+### Hardware Caches
+
+When we were studying the memory system [theoretically](/hpc/external-memory), we discussed different ways one can [implement cache eviction policies](/hpc/external-memory/policies/) in software. One particular strategy we focused on was the *least recently used* (LRU) policy, which is simple and effective but still requires some non-trivial data manipulation.
+
+In the context of hardware, such a scheme is called a *fully associative cache*: we have $M$ cells, each capable of holding a cache line corresponding to any of the $N$ total memory locations, and in case of contention, the one that hasn't been accessed for the longest time gets kicked out and replaced with the new one.
+
+![Fully associative cache](../img/cache1.png)
+
+The problem with a fully associative cache is that implementing the "find the oldest cache line among millions" operation is pretty hard to do in software and just infeasible in hardware. You can make a fully associative cache that has 16 entries or so, but managing hundreds of cache lines already becomes either prohibitively expensive or so slow that it's not worth it.
 
-This is not just a single specific bad value: it is the same for all indices that are multiple of large powers of two, and it continues much further to the right.
+We can resort to another, much simpler approach: just map each block of 64 bytes in RAM to a single cache line which it can occupy. Say, if we have 4096 blocks in memory and 64 cache lines for them, then each cache line at any time stores the contents of one of $\frac{4096}{64} = 64$ different blocks.
 
-![](../img/strides-two.svg)
+![Direct-mapped cache](../img/cache2.png)
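+
+In code, the mapping of the toy example above could be sketched as a simple modulo (the function name is made up):
+
+```c++
+// with 4096 memory blocks and 64 cache lines, a block can only ever occupy one specific line
+int cache_line_of(int block_index) {
+    return block_index % 64;
+}
+```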
 
-This effect is due to a feature called *cache associativity*, and an interesting artifact of how CPU caches are implemented in hardware.
+A direct-mapped cache is easy to implement and doesn't require storing any additional meta-information associated with a cache line except its tag (the actual memory location of a cached block). The disadvantage is that the entries can be kicked out too quickly — for example, when bouncing between two addresses that map to the same cache line — leading to lower overall cache utilization.
 
-When studying memory theoretically using the external memory model, we discussed different ways one can [implement caching policies](/hpc/memory/locality/) in software, and went into detail on particular case of a simple but effective strategy, LRU, which required some non-trivial data manipulation. In the context of hardware, such scheme is called *fully associative cache*.
+For that reason, we settle for something in-between direct-mapped and fully associative caches: the *set-associative cache*. It splits the address space into equal groups, which separately act as small fully-associative caches.
 
-![Fully associative cache](../img/cache2.png)
+![Set-associative cache (2-way associative)](../img/cache3.png)
 
-The problem with it is that implementing something like that is prohibitively expensive. In hardware, you can implement something when you have 16 entries or so, but it becomes unfeasible when it comes to storing and managing hundreds of cache lines.
+*Associativity* is the size of these sets, or, in other words, how many different cache lines each data block can be mapped to. Higher associativity allows for more efficient utilization of cache but also increases the cost.
 
-We can resort to another, much simpler approach: we could just map each block of 64 bytes in RAM to a cache line which it can possibly occupy. Say if in we have 4096 blocks in memory and 64 cache lines for them, this means that each cache line at any time stores the value of one of $\frac{4096}{64} = 64$ different blocks, along with a "tag" information which helps identifying which block it is.
+For example, on [my CPU](https://en.wikichip.org/wiki/amd/ryzen_7/4700u), the L3 cache is 16-way set-associative, and there are 4MB available to a single core. This means that there are in total $\frac{2^{22}}{2^{6}} = 2^{16}$ cache lines, which are split into $\frac{2^{16}}{16} = 2^{12}$ groups, each acting as a fully associative cache of their own $(\frac{1}{2^{12}})$-th fraction of the RAM.
 
-Simply speaking, the CPU just maintains these cells containing data, and when reading any cell from the main memory the CPU first looks it up in the cache, and if it contains the data, it reads it, and otherwise goes to a higher cache level until it reaches main memory. Simple and beautiful.
+Most other CPU caches are also set-associative, including the non-data ones such as the instruction cache and the TLB. The exceptions are small specialized caches that only house 64 or fewer entries — these are usually fully associative.
 
-![Direct-mapped cache](../img/cache1.png)
+### Address Translation
 
-Direct-mapped cache is easy to implement, but the problem with it is that the entries can be kicked out way too quickly, leading to lower cache utilization. In fact, we could just bounce between two addresses, leaving
+There is only one ambiguity remaining: how exactly the cache line mapping is done.
 
-For that reason, we settle for something in-between direct-mapped and fully associative cache: the *set-associative cache*. It splits addresses into groups which separately act as small fully-associative cache.
+If we implemented a set-associative cache in software, we would compute some hash function of the memory block address and then use its value as the cache set index. In hardware, we can't really do that because it is too slow: for example, for the L1 cache, the latency requirement is 4 or 5 cycles, and even [taking a modulo](/hpc/arithmetic/division) takes around 10-15 cycles, let alone something more sophisticated.
 
-![Set-associative cache](../img/cache3.png)
+Instead, the hardware uses the lazy approach. It takes the memory address that needs to be accessed and splits it into three parts — from lower bits to higher:
 
-*Associativity* is the size of such sets — for example 16 meaning that this way we would need to wait at least 16 reads for an entry to get kicked out. Different cache layers may have different associativity. Most CPU caches are set-associative, unless we are talking about small specialized ones that only house 64 or less entries and can get by with fully-associative schemes.
+- *offset* — the index of the word within a 64B cache line ($\log_2 64 = 6$ bits);
+- *index* — the index of the cache line set (the next $12$ bits, as there are $2^{12}$ sets in the L3 cache);
+- *tag* — the rest of the memory address, which is used to tell the memory blocks stored in the cache lines apart.
 
-If we implemented cache in software, we would compute some hash function to use as key. In hardware, we can't really do that because e. g. for L1 cache 4 or 5 cycles is all we got, and even taking a modulo takes 10-15 cycles, let alone something cryptographically secure. Therefore, hardware takes a different approach and calculates this address based on the address. It takes the address, and reinterprets it in three parts:
+In other words, all memory addresses with the same "middle" part map to the same set.
 
-![](../img/address.png)
+![Address composition for a 64-entry 2-way set-associative cache](../img/address.png)
 
-The last part is used for determining the cache line it is mapped to. All addresses with the same "middle" part will therefore map to the same set.
+This makes the cache system simpler and cheaper to implement but also susceptible to certain bad access patterns.
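+
+As a sketch, extracting the three parts for this particular L3 cache (64-byte lines and $2^{12}$ sets) boils down to a few shifts and masks (the function names are made up):
+
+```c++
+#include <cstdint>
+
+uint64_t offset_of(uint64_t addr) { return addr & ((1 << 6) - 1); }         // bits 0-5
+uint64_t set_of(uint64_t addr)    { return (addr >> 6) & ((1 << 12) - 1); } // bits 6-17
+uint64_t tag_of(uint64_t addr)    { return addr >> (6 + 12); }              // the rest
+```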
 
-Now, where were we? Oh yes, the reason why iterating with strides of 256 has such a terrible slowdown. This because they all map to the same set, and effectively the size of the cache (and all below it) shrinks by 256/16=16. No longer being able to reside in L2, it spills all the way to the order-of-magnitude slower RAM, which causes the expected slowdown.
+### Pathological Mappings
 
-This issue arises with remarkable frequency in all types of algorithms that love powers of two. Luckily, this behavior is more of an anomaly than some that needs to be dealt with. The solution is usually simple: avoid iterating in powers of two, using different sizer on 2d arrays or inserting "holes" in the memory layout.
+Now, where were we? Oh, yes: the reason why iteration with strides of 256 causes such a terrible slowdown.
+
+When we jump over 256 integers, the pointer always increments by $1024 = 2^{10}$ bytes, so the lower 10 bits of the address never change. Since the cache system uses the lower 6 bits for the offset and the next 12 for the set index, we are essentially using just $2^{12 - (10 - 6)} = 2^8$ different sets in the L3 cache instead of $2^{12}$, which has the effect of shrinking our L3 cache by a factor of $2^4 = 16$. The array stops fitting into the L3 cache ($N=2^{21}$) and spills into the order-of-magnitude slower RAM, which causes the performance to decrease.
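+
+We can verify this shrinkage with a quick sketch that simply counts the distinct set indices touched by the strided loop (this is ordinary host code, not something the CPU does):
+
+```c++
+#include <cstdint>
+#include <cstdio>
+#include <set>
+
+int main() {
+    std::set<uint64_t> sets;
+    for (uint64_t addr = 0; addr < (1 << 23); addr += 1024) // 2^21 ints, stride of 256 ints = 1024 bytes
+        sets.insert((addr >> 6) & ((1 << 12) - 1));         // 6 offset bits, 12 set-index bits
+    printf("%zu\n", sets.size());                           // prints 256 = 2^8
+    return 0;
+}
+```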
+
+
+
+Performance issues caused by cache associativity effects arise with remarkable frequency in algorithms because, for multiple reasons, programmers just love using powers of two when indexing arrays:
+
+- It is easier to calculate the address for multi-dimensional array accesses if the last dimension is a power of two, as it only requires a binary shift instead of a multiplication.
+- It is easier to calculate modulo a power of two, as it can be done with a single bitwise `and`.
+- It is convenient and often even necessary to use power-of-two problem sizes in divide-and-conquer algorithms.
+- Two is the smallest integer base, so the sequence of increasing powers of two is a popular choice of problem sizes when benchmarking memory-bound algorithms.
+- Also, the more natural powers of ten are themselves divisible by a slightly smaller power of two.
+
+This especially often applies to implicit data structures that use a fixed memory layout. For example, [binary searching](/hpc/data-structures/binary-search) over arrays of size $2^{20}$ takes ~360ns per query, while searching over arrays of size $(2^{20} + 123)$ takes ~300ns. When the array size is a multiple of a large power of two, the indices of the "hottest" elements, the ones we are likely to request on the first dozen or so iterations, will also be divisible by some large powers of two and map to the same cache set — kicking each other out and causing a ~20% performance decrease.
+
+Luckily, such issues are more of an anomaly than a serious problem. The solution is usually simple: avoid iterating with power-of-two strides, make the last dimensions of multi-dimensional arrays a slightly different size, use any other method of inserting "holes" in the memory layout, or create some seemingly random bijection between the array indices and the locations where the data is actually stored.
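+
+For example, a common sketch of the "holes" trick for a two-dimensional array (the extra 16 is arbitrary):
+
+```c++
+const int N = 1024;
+int a[N][N + 16]; // pad each row so that row starts don't all share a large power-of-two alignment
+```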
+
+
diff --git a/content/english/hpc/cpu-cache/bandwidth.md b/content/english/hpc/cpu-cache/bandwidth.md
index ad04186b..472a5689 100644
--- a/content/english/hpc/cpu-cache/bandwidth.md
+++ b/content/english/hpc/cpu-cache/bandwidth.md
@@ -1,11 +1,17 @@
 ---
 title: Memory Bandwidth
-weight: 2
+weight: 1
+published: true
 ---
 
-For many algorithms, memory bandwidth is the most important characteristic of the cache system. Coincidentally, it is also the easiest to measure.
+On the data path between the CPU registers and the RAM, there is a hierarchy of *caches* that exist to speed up access to frequently used data: the layers closer to the processor are faster but also smaller in size. The word "faster" here applies to two closely related but separate timings:
 
-For our benchmark, let's create an array and linearly iterate over it $K$ times, incrementing its values:
+- The delay between the moment when a read or a write is initiated and when the data arrives (*latency*).
+- The number of memory operations that can be processed per unit of time (*bandwidth*).
+
+For many algorithms, memory bandwidth is the most important characteristic of the cache system. And at the same time, it is also the easiest to measure, so we are going to start with it.
+
+For our experiment, we create an array and iterate over it $K$ times, incrementing its values:
 
 ```cpp
 int a[N];
@@ -15,29 +21,89 @@ for (int t = 0; t < K; t++)
         a[i]++;
 ```
 
-Changing $N$ and adjusting $K$ so that the total number of cells accessed remains roughly constant, and normalizing the timings as "operations per second", we get the following results:
+Changing $N$ and adjusting $K$ so that the total number of array cells accessed remains roughly constant and expressing the total time in "operations per second," we get a graph like this:
 
 ![Dotted vertical lines are cache layer sizes](../img/inc.svg)
 
-You can clearly see the sizes of the cache layers on this graph. When the whole array fits into the lowest layer of cache, the program is bottlenecked by CPU rather than L1 cache bandwidth. As the the array becomes larger, overhead becomes smaller, and the performance approaches this theoretical maximum. But then it drops: first to ~12 GFLOPS when it exceeds L1 cache, and then gradually to about 2.1 GFLOPS when it can no longer fit in L3.
+You can clearly see the cache sizes on this graph:
+
+- When the whole array fits into the lowest layer of cache, the program is bottlenecked by the CPU rather than the L1 cache bandwidth. As the array becomes larger, the overhead associated with the first iterations of the loop becomes smaller, and the performance gets closer to its theoretical maximum of 16 GFLOPS.
+- But then the performance drops: first to 12-13 GFLOPS when it exceeds the L1 cache, and then gradually to about 2 GFLOPS when it can no longer fit in the L3 cache.
+
+This situation is typical for many lightweight loops.
 
-All CPU cache layers are placed on the same microchip as the processor, so bandwidth, latency, all its other characteristics scale with the clock frequency. RAM, on the other side, lives on its own clock, and its characteristics remain constant. This can be seen on these graphs if we run the same benchmark while turning frequency boost on:
+### Frequency Scaling
+
+All CPU cache layers are placed on the same microchip as the processor, so the bandwidth, latency, and all their other characteristics scale with the clock frequency. The RAM, on the other hand, lives on its own fixed clock, and its characteristics remain constant. We can observe this by re-running the same benchmark with turbo boost on:
 
 ![](../img/boost.svg)
 
-To reduce noise, we will run all the remaining benchmarks at plain 2GHz — but the lesson to retain here is that the relative performance of different approaches or decisions between algorithm designs may depend on the clock frequency — unless when we are working with datasets that either fit in cache entirely.
+This detail comes into play when comparing algorithm implementations. Unless the working dataset fits entirely in the cache, the relative performance of two implementations may change with the CPU clock rate, because the RAM is unaffected by it while everything else speeds up.
+
+For this reason, it is [advised](/hpc/profiling/noise) to keep the clock rate fixed, and since turbo boost isn't stable enough, we run most of the benchmarks in this book at plain 2GHz.
 
-
+### Directional Access
 
-**Exercise: theoretical peak performance.** By the way, assuming infinite bandwidth, what would the throughput of that loop be? How to verify that the 14 GFLOPS figure is the CPU limit and not L1 peak bandwidth? For that we need to look a bit closer at how the processor will execute the loop.
+This incrementing loop needs to perform both reads and writes during its execution: on each iteration, we fetch a value, increment it, and then write it back. In many applications, we only need to do one of them, so let’s try to measure unidirectional bandwidth.
 
-Incrementing an array can be done with SIMD; when compiled, it uses just two operations per 8 elements — performing the read-fused addition and writing the result back:
+Calculating the sum of an array only requires memory reads:
 
-```asm
-vpaddd  ymm0, ymm1, YMMWORD PTR [rax]
-vmovdqa YMMWORD PTR [rax], ymm0
+```c++
+for (int i = 0; i < N; i++)
+    s += a[i];
 ```
 
-This computation is bottlenecked by the write, which has a throughput of 1. This means that we can theoretically increment and write back 8 values per cycle on average, yielding the performance of 2 GHz × 8 = 16 GFLOPS (or 32.8 in boost mode), which is fairly close to what we observed.
+And zeroing an array (or filling it with any other constant value) only requires memory writes:
+
+```c++
+for (int i = 0; i < N; i++)
+    a[i] = 0;
+```
+
+Both loops are trivially [vectorized](/hpc/simd) by the compiler, and the second one is actually replaced with a `memset`, so the CPU is also not the bottleneck here (except when the array fits into the L1 cache).
+
+![](../img/directional.svg)
+
+The reason why unidirectional and bidirectional memory accesses perform differently is that reads and writes share the cache and memory buses and other CPU facilities. In the case of the RAM, this causes a twofold difference in performance between the pure-read and the simultaneous read-and-write scenarios: the memory controller has to keep switching modes on the one-way memory bus, which halves the bandwidth. The performance drop is less severe for the L2 cache: the bottleneck there is not the cache bus, so the incrementing loop loses only ~15%.
+
+There is one interesting anomaly on the graph: the write-only loop performs the same as the read-and-write one once the array spills into the L3 cache and the RAM. This is because, on each access, the CPU moves the data up the cache hierarchy, whether it is a read or a write — which is typically a good optimization, as in many use cases we will be needing that data again soon. When reading, this isn't a problem, as the data travels through the cache hierarchy anyway, but when writing, it causes an implicit read to be dispatched along with the write — thus requiring twice the bus bandwidth.
+
+### Bypassing the Cache
+
+We can prevent the CPU from reading back the data that we have just written by using *non-temporal* memory accesses. To do this, we need to re-implement the zeroing loop more explicitly, without relying on compiler vectorization.
+
+Ignoring a few special cases, what `memset` and auto-vectorized assignment loops do under the hood is simply [move](/hpc/simd/moving) 32-byte blocks of data with [SIMD instructions](/hpc/simd):
+
+```c++
+const __m256i zeros = _mm256_set1_epi32(0);
+
+for (int i = 0; i + 7 < N; i += 8)
+    _mm256_store_si256((__m256i*) &a[i], zeros);
+```
+
+We can replace the usual vector store intrinsic with a *non-temporal* one:
+
+```c++
+const __m256i zeros = _mm256_set1_epi32(0);
+
+for (int i = 0; i + 7 < N; i += 8)
+    _mm256_stream_si256((__m256i*) &a[i], zeros);
+```
+
+Non-temporal memory reads and writes are a way of telling the CPU that we won't be needing the data we have just accessed anytime soon, so there is no need to keep it in the cache or, in the case of a write, to read the cache line back first.
+
+![](../img/non-temporal.svg)
+
+On the one hand, if the array is small enough to fit into the cache and we actually access it shortly after, this has a negative effect: we have to read it entirely from the RAM (or, in this case, *write* it into the RAM instead of using a locally cached version). On the other hand, this prevents read-backs and lets us use the memory bus more efficiently.
+
+In fact, for the RAM region, the performance increase is more than twofold: the non-temporal version even beats the read-only benchmark. This happens because:
+
+- the memory controller doesn't have to switch the bus between read and write modes this way;
+- the instruction sequence becomes simpler, allowing for more pending memory instructions;
+- and, most importantly, the memory controller can simply "fire and forget" non-temporal write requests — while for reads, it needs to remember what to do with the data once it arrives (similar to connection handles in networking software).
+
+Theoretically, both requests should use the same bandwidth: a read request sends an address and gets data back, and a non-temporal write request sends an address *with* data and gets nothing back. Direction aside, the same amount of data is transmitted, but the read cycle is longer because it has to wait for the data to be fetched. Since [there is a practical limit](../mlp) on how many concurrent requests the memory system can handle, this difference in read/write cycle latency also results in a difference in their bandwidth.
+
+Also, for these reasons, a single CPU core usually [can't fully saturate the memory bandwidth](../sharing).
 
-On all modern architectures, you can typically assume that you won't ever be bottlenecked by the throughput of L1 cache, but rather by the read/write execution ports or the arithmetic. In these extreme cases, it may be beneficial to store some data in registers without touching any of the memory, which we will cover later in the book.
+The same technique generalizes to `memcpy`: it also just moves 32-byte blocks with SIMD load/store instructions, and it can be similarly made non-temporal, increasing the throughput twofold for large arrays. There is also a non-temporal load instruction (`_mm256_stream_load_si256`) for when you want to *read* without polluting the cache (e.g., when you don't need the original array after a `memcpy`, but will need some data that you had accessed before calling it).
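+
+As an illustration, here is a minimal sketch of such a non-temporal copy (the `memcpy_nt` name is made up for this example; it assumes AVX support, 32-byte-aligned and non-overlapping arrays, and a length that is a multiple of 8 integers):
+
+```c++
+#include <immintrin.h>
+
+void memcpy_nt(int *dst, const int *src, int n) {
+    for (int i = 0; i + 7 < n; i += 8) {
+        __m256i x = _mm256_load_si256((const __m256i*) &src[i]); // a normal (cached) load
+        _mm256_stream_si256((__m256i*) &dst[i], x);               // a non-temporal store
+    }
+}
+```
+
+As with the zeroing loop, the benefit should only show up when the arrays are too large to fit in the cache.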
diff --git a/content/english/hpc/cpu-cache/cache-lines.md b/content/english/hpc/cpu-cache/cache-lines.md
index 80772983..4ba63632 100644
--- a/content/english/hpc/cpu-cache/cache-lines.md
+++ b/content/english/hpc/cpu-cache/cache-lines.md
@@ -1,28 +1,48 @@
 ---
 title: Cache Lines
-weight: 4
+weight: 3
 ---
 
-The most important feature of the memory system is that it deals with cache lines, and not individual bytes.
+The basic units of data transfer in the CPU cache system are not individual bits and bytes, but *cache lines*. On most architectures, the size of a cache line is 64 bytes, meaning that all memory is divided into blocks of 64 bytes, and whenever you request (read or write) a single byte, you are also fetching its 63 cache line neighbors whether you want them or not.
 
-To demonstrate this, let's add "step" parameter to our loop — we will now increment every $D$-th element:
+To demonstrate this, we add a "step" parameter to our [incrementing loop](../bandwidth). Now we only touch every $D$-th element:
  
 ```cpp
-for (int t = 0; t < K; t++)
-    for (int i = 0; i < N; i += D)
-        a[i]++;
+for (int i = 0; i < N; i += D)
+    a[i]++;
 ```
 
-When we run it with $D=16$, we can observe something interesting:
+If we run it with $D=1$ and $D=16$, we can observe something interesting:
 
 ![Performance is normalized by the total time to run benchmark, not the total number of elements incremented](../img/strided.svg)
 
-As the problem size grows, the graphs of the two loops meet, despite one doing 16 times less work than the other. This is because in terms of cache lines, we are fetching exactly the same memory; the fact that the strided computation only needs one sixteenth of it is irrelevant.
+As the problem size grows, the graphs of the two loops meet, despite one doing 16 times less work than the other. This is because, in terms of cache lines, we are fetching the exact same memory in both loops, and the fact that the strided loop only needs one-sixteenth of it is irrelevant.
 
-It does work a bit faster when the array fits into lower layers of cache because the loop becomes much simples: all it does is `inc DWORD PTR [rdx]` (yes, x86 has instructions that only involve memory locations and no registers or immediate values). It also has a throughput of one, but while the former code needed to perform two of writes per cache line, this only needs one, hence it works twice as fast when memory is not a concern.
+When the array fits into the L1 cache, the strided version completes faster — although only about twice as fast, not 16 times. This is because it needs to do half the work: it executes a single `inc DWORD PTR [rdx]` instruction for every 16 elements, while the original loop needed two 8-element [vector instructions](/hpc/simd) to process the same 16 elements. Both computations are bottlenecked by writing the result back: Zen 2 can only perform one write per cycle — regardless of whether it stores one integer or eight.
 
-When we change the step parameter to 8, the graphs equalize:
+When we change the step parameter to 8, the graphs equalize, as we now also need two increments and two write-backs for every 16 elements:
 
 ![](../img/strided2.svg)
 
-The important lesson is to count the number of cache lines to fetch when analyzing memory-bound algorithms, and not the total count of memory accesses. This becomes increasingly important with larger problem sizes.
+We can use this effect to reduce cache line sharing in our [latency benchmark](../latency) and measure the latency more precisely. To do that, we *pad* the indices of the permutation so that each of them lies in its own cache line:
+
+```c++
+struct padded_int {
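+    // 16 ints in total = 64 bytes: the struct occupies exactly one cache line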
+    int val;
+    int padding[15];
+};
+
+padded_int q[N / 16];
+
+// constructing a cycle from a random permutation
+// ...
+
+for (int i = 0; i < N / 16; i++)
+    k = q[k].val;
+```
+
+Now, each index is much more likely to be kicked out of the cache by the time we loop around and request it again:
+
+![](../img/permutation-padded.svg)
+
+The important practical lesson when designing and analyzing memory-bound algorithms is to count the number of cache lines accessed and not just the total count of memory reads and writes.
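+
+For example, assuming 64-byte cache lines and 4-byte integers, the strided loop above touches roughly $N / \max(D, 16)$ cache lines while performing only $N / D$ increments: the $D=1$ and $D=16$ versions therefore transfer exactly the same number of cache lines, while a hypothetical $D=32$ run would transfer only half as many.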
diff --git a/content/english/hpc/cpu-cache/hw-prefetching.md b/content/english/hpc/cpu-cache/hw-prefetching.md
deleted file mode 100644
index 0c836011..00000000
--- a/content/english/hpc/cpu-cache/hw-prefetching.md
+++ /dev/null
@@ -1,41 +0,0 @@
----
-title: Hardware Prefetching
-weight: 5
----
-
-In the bandwidth benchmark, we iterated over array and fetched its elements. Although separately each memory read in that case is not different from the fetch in pointer chasing, they run much faster because they can are overlapped: and in fact, CPU issues read requests in advance without waiting for the old ones to complete, so that the results come about the same time as the CPU needs them.
-
-In fact, this sometimes works even when we are not sure which instruction is going to be executed next. Consider the following example:
-
-```cpp
-bool cond = some_long_memory_operation();
-
-if (cond)
-    do_this_fast_operation();
-else
-    do_that_fast_operation();
-```
-
-What most modern CPUs do is they start evaluating one (most likely) branch without waiting for the condition to be computed. If they are right, then you will progress faster, and if they are wrong, the worst thing will happen is they discard some useless computation. This includes memory operations too, including cache system — because, well, we wait for a hundred cycles anyway, why not evaluate at least one of the branches ahead of time. By the way, this is what Meltdown was all about.
-
-This general technique of hiding latency with bandwidth is called *prefetching* — and it can be either implicit or explicit. CPU automatically running ahead in the pipeline is just one way to use it. Hardware can figure out even without looking at the future instructions, and just by analyzing memory access patterns. Hiding latency is crucial — it is pretty much the single most important idea we keep coming back to in this book. Apart from having a very large pipeline and using the fact that scheduler can look ahead in it, modern memory controllers can detect simple patterns such as iterating backwards, forwards, including using constant small-ish strides.
-
-Here is how to test it: we now generate our permutation in a way that makes us load consecutive cache lines, but we fetch elements in random order inside the cache lines.
-
-```cpp
-int p[15], q[N];
-
-iota(p, p + 15, 1);
-
-for (int i = 0; i + 16 < N; i += 16) {
-    random_shuffle(p, p + 15);
-    int k = i;
-    for (int j = 0; j < 15; j++)
-        k = q[k] = i + p[j];
-    q[k] = i + 16;
-}
-```
-
-The latency here remains constant at 3ns regardless (or whatever is the latency of pointers / bit fields implementation).
-
-Hardware prefetching is usually powerful enough for most cases. You can iterate over multiple arrays, sometimes with small strides, or load just small amounts. It is as intelligent and detrimental to performance as branch prediction.
diff --git a/content/english/hpc/cpu-cache/img/aos-soa-padded-n.svg b/content/english/hpc/cpu-cache/img/aos-soa-padded-n.svg
new file mode 100644
index 00000000..2c554a2c
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/aos-soa-padded-n.svg
@@ -0,0 +1,1330 @@
diff --git a/content/english/hpc/cpu-cache/img/aos-soa-padded.svg b/content/english/hpc/cpu-cache/img/aos-soa-padded.svg
new file mode 100644
index 00000000..e5132965
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/aos-soa-padded.svg
@@ -0,0 +1,1371 @@
diff --git a/content/english/hpc/cpu-cache/img/aos-soa.svg b/content/english/hpc/cpu-cache/img/aos-soa.svg
new file mode 100644
index 00000000..14219dd5
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/aos-soa.svg
@@ -0,0 +1,1267 @@
diff --git a/content/english/hpc/cpu-cache/img/directional.svg b/content/english/hpc/cpu-cache/img/directional.svg
new file mode 100644
index 00000000..9badafe2
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/directional.svg
@@ -0,0 +1,1390 @@
diff --git a/content/english/hpc/cpu-cache/img/latency-bandwidth.svg b/content/english/hpc/cpu-cache/img/latency-bandwidth.svg
new file mode 100644
index 00000000..3f70b11a
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/latency-bandwidth.svg
@@ -0,0 +1,1074 @@
diff --git a/content/english/hpc/cpu-cache/img/non-temporal.svg b/content/english/hpc/cpu-cache/img/non-temporal.svg
new file mode 100644
index 00000000..78ca301e
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/non-temporal.svg
@@ -0,0 +1,1480 @@
diff --git a/content/english/hpc/cpu-cache/img/parallel-bandwidth.svg b/content/english/hpc/cpu-cache/img/parallel-bandwidth.svg
new file mode 100644
index 00000000..11ff2798
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/parallel-bandwidth.svg
@@ -0,0 +1,1146 @@
diff --git a/content/english/hpc/cpu-cache/img/permutation-hugepages.svg b/content/english/hpc/cpu-cache/img/permutation-hugepages.svg
new file mode 100644
index 00000000..2a3feb42
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/permutation-hugepages.svg
@@ -0,0 +1,1184 @@
diff --git a/content/english/hpc/cpu-cache/img/permutation-mlp.svg b/content/english/hpc/cpu-cache/img/permutation-mlp.svg
new file mode 100644
index 00000000..75056a8a
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/permutation-mlp.svg
@@ -0,0 +1,1353 @@
diff --git a/content/english/hpc/cpu-cache/img/permutation-padded.svg b/content/english/hpc/cpu-cache/img/permutation-padded.svg
new file mode 100644
index 00000000..c3dae3be
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/permutation-padded.svg
@@ -0,0 +1,1278 @@
diff --git a/content/english/hpc/cpu-cache/img/ram.png b/content/english/hpc/cpu-cache/img/ram.png
new file mode 100644
index 00000000..b4566184
Binary files /dev/null and b/content/english/hpc/cpu-cache/img/ram.png differ
diff --git a/content/english/hpc/cpu-cache/img/soa-hugepages.svg b/content/english/hpc/cpu-cache/img/soa-hugepages.svg
new file mode 100644
index 00000000..e7a669ca
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/soa-hugepages.svg
@@ -0,0 +1,1261 @@
diff --git a/content/english/hpc/cpu-cache/img/strides-small.svg b/content/english/hpc/cpu-cache/img/strides-small.svg
new file mode 100644
index 00000000..0b76d21d
--- /dev/null
+++ b/content/english/hpc/cpu-cache/img/strides-small.svg
@@ -0,0 +1,1463 @@
diff --git a/content/english/hpc/cpu-cache/img/strides-two.svg b/content/english/hpc/cpu-cache/img/strides-two.svg
deleted file mode 100644
index 8200a958..00000000
--- a/content/english/hpc/cpu-cache/img/strides-two.svg
+++ /dev/null
@@ -1,1294 +0,0 @@
diff --git a/content/english/hpc/cpu-cache/latency.md b/content/english/hpc/cpu-cache/latency.md
index 37867121..4f787595 100644
--- a/content/english/hpc/cpu-cache/latency.md
+++ b/content/english/hpc/cpu-cache/latency.md
@@ -1,20 +1,21 @@
 ---
 title: Memory Latency
-weight: 1
+weight: 2
 ---
 
-Despite bandwidth — how many data one can load — is a more complicated concept, it is much easier to observe and measure than latency — how much time it takes to load one cache line.
+Even though [bandwidth](../bandwidth) is a more complicated concept, it is much easier to observe and measure than latency: you can simply execute a long series of independent read or write queries, and the scheduler, having access to them in advance, reorders and overlaps them, hiding their latency and maximizing the total throughput.
 
-Measuring memory bandwidth is easy because the CPU can simply queue up multiple iterations of data-parallel loops like the one above. The scheduler gets access to the needed memory locations far in advance and can dispatch read requests in a way that will overlap all memory operations, hiding the latency.
-
-To measure latency, we need to design an experiment where the CPU can't cheat by knowing the memory location in advance. We can do this like this: generate a random permutation of size $n$ that corresponds a full cycle, and then repeatedly follow the permutation.
+To measure *latency*, we need to design an experiment where the CPU can't cheat by knowing the memory locations we will request in advance. One way to ensure this is to generate a random permutation of size $N$ that corresponds to a cycle and then repeatedly follow the permutation:
 
 ```cpp
 int p[N], q[N];
 
+// generating a random permutation
 iota(p, p + N, 0);
 random_shuffle(p, p + N);
 
+// this permutation may contain multiple cycles,
+// so instead we use it to construct another permutation with a single cycle
 int k = p[N - 1];
 for (int i = 0; i < N; i++)
     k = q[k] = p[i];
@@ -24,30 +25,71 @@ for (int t = 0; t < K; t++)
         k = q[k];
 ```
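+
+To turn this loop into an actual latency measurement, we can time the whole chase and divide by the number of jumps. This is only a sketch reusing the `p`, `q`, `k`, `N`, and `K` definitions above, not the exact benchmark behind the graphs below (printing `k` keeps the compiler from optimizing the loop away):
+
+```cpp
+#include <chrono>
+#include <cstdio>
+
+auto start = std::chrono::steady_clock::now();
+
+for (int t = 0; t < K; t++)
+    for (int i = 0; i < N; i++)
+        k = q[k];
+
+auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+    std::chrono::steady_clock::now() - start
+).count();
+
+printf("%.2f ns per access (k = %d)\n", double(ns) / K / N, k);
+```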
 
-This performance anti-pattern is known as *pointer chasing*, and it very frequent in software, especially written in high-level languages. Iterating an array this way is considerably slower.
+Compared to linear iteration, it is *much* slower — by multiple orders of magnitude — to visit all elements of an array this way. Not only does it make [SIMD](/hpc/simd) impossible, but it also [stalls the pipeline](/hpc/pipelining), creating a large traffic jam of instructions, all waiting for a single piece of data to be fetched from the memory.
+
+This performance anti-pattern is known as *pointer chasing*, and it is very frequent in data structures, especially those written in high-level languages, which use lots of heap-allocated objects and the pointers to them required for dynamic typing.
+
+![](../img/latency-throughput.svg)
+
+When talking about latency, it makes more sense to use cycles or nanoseconds rather than throughput units, so we replace this graph with its reciprocal:
 
 ![](../img/permutation-latency.svg)
 
-When speaking of latency, it makes more sense to use cycles or nanoseconds rather than bandwidth units. So we will replace this graph with its reciprocal:
+Note that the cliffs on both graphs aren't as distinct as they were in the bandwidth benchmark. This is because we still have some chance of hitting the previous layer of cache even when the array can't fit into it entirely.
 
-![](../img/latency-throughput.svg)
+### Theoretical Latency
 
-It is generally *much* slower — by multiple orders of magnitude — to iterate an array this way. Not only because it makes SIMD practically impossible, but also because it stalls the pipeline a lot.
+More formally, if there are $k$ levels in the cache hierarchy with sizes $s_i$ and latencies $l_i$, then, instead of being equal to the latency of the slowest level the array touches, the expected latency of a random access will be:
 
-### Latency of RAM and TLB
+$$
+E[L] = \frac{
+      s_1 \cdot l_1
+    + (s_2 - s_1) \cdot l_2
+%    + (s_3 - s_2) \cdot l_3
+    + \ldots
+    + (N - s_k) \cdot l_{RAM}
+    }{N}
+$$
 
-Similar to bandwidth, the latency of CPU cache scales with its clock frequency, while the RAM lives on its own fixed-frequency clock, and its performance is therefore usually measured in nanoseconds. We can observe this difference if we change the frequency by turning turbo boost on.
+If we denote by $C$ the total amount of latency saved thanks to the cache layers preceding the slowest one (a constant that no longer depends on $N$ once the array stops fitting in them), we can reduce the formula to just this:
 
-![](../img/permutation-boost.svg)
+$$
+E[L] = \frac{N \cdot l_{last} - C}{N} = l_{last} - \frac{C}{N}
+$$
 
-The graph starts making a bit more sense if we look at the relative speedup instead.
+As $N$ increases, the expected latency slowly approaches $l_{last}$, and if you squint hard enough, the graph of the throughput (reciprocal latency) should roughly look as if it were composed of a few shifted and scaled hyperbolas:
 
-![](../img/permutation-boost-speedup.svg)
+$$
+\begin{aligned}
+E[L]^{-1} &= \frac{1}{l_{last} - \frac{C}{N}}
+\\        &= \frac{N}{N \cdot l_{last} - C}
+\\        &= \frac{1}{l_{last}} \cdot \frac{N + \frac{C}{l_{last}} - \frac{C}{l_{last}}}{N - \frac{C}{l_{last}}}
+\\        &= \frac{1}{l_{last}} \cdot \left(\frac{1}{N \cdot \frac{l_{last}}{C} - 1} + 1\right)
+\\        &= \frac{1}{k \cdot (x - x_0)} + y_0
+\end{aligned}
+$$
+
+To get the actual latency numbers, we can iteratively apply the first formula to deduce $l_1$, then $l_2$, and so on. Or just look at the values right before the cliff — they should be within 10-15% of the true latency.
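+
+For example, once $l_1$ is known, measuring $E[L]$ for some array size $s_1 < N \leq s_2$ leaves only two terms in the formula, which can then be solved for $l_2$:
+
+$$
+l_2 = \frac{E[L] \cdot N - s_1 \cdot l_1}{N - s_1}
+$$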
 
-You would expect 2x rates for array sizes that fit into CPU cache entirely, but then roughly equal for arrays stored in RAM. But this is not quite what is happening: there is a small, fixed-latency delay on lower clocked run even for RAM accesses. This happens because the CPU first checks its cache before dispatching a read query to the main memory — to save RAM bandwidth for other processes that potentially need it.
+There are more direct ways to measure latency, including the use of [non-temporal reads](../bandwidth), but this benchmark is more representative of practical access patterns.
 
-Actually, TLB misses may stall memory reads for the same reason. The TLB cache is called "lookaside" because the lookup can happen independently from normal data cache lookups. L1 and L2 caches on the other side are private to the core, and so they can store virtual addresses and be queried concurrently with TLB — after fetching a cache line, its tag is used to restore the physical address, which is then checked against the concurrently fetched TLB entry. This trick does not work for shared memory however, because their bandwidth is limited, and dispatching read queries there for no reason is not a good idea in general. So we can observe a similar effect in L3 and RAM reads when the page does not fit L1 TLB and L2 TLB respectively.
+
+
+### Frequency Scaling
+
+Similar to bandwidth, the latency of the CPU caches scales proportionally with the clock frequency, while the RAM lives on its own fixed-frequency clock. We can observe this difference if we change the clock frequency by turning turbo boost on.
+
+![](../img/permutation-boost.svg)
+
+The graph starts making more sense if we plot it as a relative speedup.
+
+![](../img/permutation-boost-speedup.svg)
 
-For sparse reads, it often makes sense to increase page size, which improves the latency.
+You would expect a 2x speedup for array sizes that fit into the CPU cache entirely and then roughly equal performance for arrays stored in RAM. But this is not quite what happens: there is a small, fixed-latency delay on the lower-clocked run even for RAM accesses. This happens because the CPU first has to check its cache before dispatching a read query to the main memory — to save RAM bandwidth for other processes that may potentially need it.
 
-It is possible, but quite tedious to also construct an experiment actually measuring all this — so you will have to take my word on that one.
+Memory latency is also slightly affected by some details of the [virtual memory implementation](../paging) and [RAM-specific timings](../mlp), which we will discuss later.
diff --git a/content/english/hpc/cpu-cache/mlp.md b/content/english/hpc/cpu-cache/mlp.md
index 52c331c5..95dfa4cb 100644
--- a/content/english/hpc/cpu-cache/mlp.md
+++ b/content/english/hpc/cpu-cache/mlp.md
@@ -1,7 +1,59 @@
 ---
 title: Memory-Level Parallelism
-weight: 3
-draft: true
+weight: 5
 ---
 
-...
+Memory requests can overlap in time: while you wait for a read request to complete, you can send a few others, which will be executed concurrently with it. This is the main reason why [linear iteration](../bandwidth) is so much faster than [pointer jumping](../latency): the CPU knows which memory locations it needs to fetch next and sends memory requests far ahead of time.
+
+The number of concurrent memory operations is large but limited, and it is different for different types of memory. When designing algorithms and especially data structures, you may want to know this number, as it limits the amount of parallelism your computation can achieve.
+
+To find this limit theoretically for a specific memory type, you can multiply its latency (time to fetch a cache line) by its bandwidth (number of cache lines fetched per second), which gives you the average number of memory operations in progress:
+
+![](../img/latency-bandwidth.svg)
+
+The latency of the L1/L2 caches is small, so there is no need for a long pipeline of pending requests, but larger memory types can sustain up to 25-40 concurrent read operations.
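+
+For a rough sanity check, take some illustrative numbers (not exact measurements from this machine): a ~$80$ ns RAM latency and a ~$32$ GB/s single-core read bandwidth, which is half a cache line per nanosecond. Their product, which is essentially [Little's law](https://en.wikipedia.org/wiki/Little%27s_law), lands in the same 25-40 range:
+
+$$
+80 \;\text{ns} \times 0.5 \;\frac{\text{lines}}{\text{ns}} = 40 \;\text{cache lines in flight}
+$$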
+
+### Direct Experiment
+
+Let's try to measure available memory parallelism more directly by modifying our pointer chasing benchmark so that we loop around $D$ separate cycles in parallel instead of just one: 
+
+```c++
+const int M = N / D;
+int p[M], q[D][M], k[D]; // k[d] is the current position in the d-th cycle
+
+for (int d = 0; d < D; d++) {
+    iota(p, p + M, 0);
+    random_shuffle(p, p + M);
+    k[d] = p[M - 1];
+    for (int i = 0; i < M; i++)
+        k[d] = q[d][k[d]] = p[i];
+}
+
+for (int i = 0; i < M; i++)
+    for (int d = 0; d < D; d++)
+        k[d] = q[d][k[d]];
+```
+
+Keeping the total length of the cycles fixed at a few select sizes and trying different $D$, we get slightly different results:
+
+![](../img/permutation-mlp.svg)
+
+The L2 cache run is limited to ~6 concurrent operations, as predicted, but the larger memory types all max out between 13 and 17. We can't make use of more lanes because we run out of logical registers to hold their current positions. As long as the number of lanes is smaller than the number of available registers, the compiler issues just one read instruction per lane:
+
+```nasm
+dec     edx
+movsx   rdi, DWORD PTR q[0+rdi*4]
+movsx   rsi, DWORD PTR q[1048576+rsi*4]
+movsx   rcx, DWORD PTR q[2097152+rcx*4]
+movsx   rax, DWORD PTR q[3145728+rax*4]
+jne     .L9
+```
+
+But when it exceeds ~15, the compiler has to spill the lane states into temporary memory storage:
+
+```nasm
+mov     edx, DWORD PTR q[0+rdx*4]
+mov     DWORD PTR [rbp-128+rax*4], edx
+```
+
+You don't always get to the maximum possible level of memory parallelism, but for most applications, a dozen concurrent requests are more than enough.
diff --git a/content/english/hpc/cpu-cache/packing.md b/content/english/hpc/cpu-cache/packing.md
deleted file mode 100644
index 201f0eac..00000000
--- a/content/english/hpc/cpu-cache/packing.md
+++ /dev/null
@@ -1,46 +0,0 @@
----
-title: Data Packing
-weight: 10
----
-
-If you know what you are doing, you can turn disable padding and instead pack you data structure as tight as possible. This is done
-
-When loading it though, the
-
-```cpp
-struct __attribute__ ((packed)) Data {
-    char a;
-    short b;
-    int c;
-    char d;
-};
-```
-
-This is a less standardized feature, but you can also use it with *bit fields* to members of less than fixed size.
-
-```cpp
-struct __attribute__ ((packed)) Data {
-    char a;     // 1 byte
-    int b : 24; // 3 bytes
-};
-```
-
-The structure takes 4 bytes when packed and 8 bytes when padded. This feature is not so widespread because CPUs don't have 3-byte arithmetic and has to do some inefficient conversion during loading:
-
-```cpp
-int load(char *p) {
-    char x = p[0], y = p[1], z = p[2];
-    return (x << 16) + (y << 8) + z;
-}
-```
-
-This can be optimized by loading a 4-byte `int` and then using a mask to discard its highest bits.
-
-```cpp
-int load(int *p) {
-    int x = *p;
-    return x & ((1<<24) - 1);
-}
-```
-
-Compilers usually don't do that, because this is not technically legal sometimes: may not own that 4th byte, and won't let you load it even if you are discarding it.
diff --git a/content/english/hpc/cpu-cache/paging.md b/content/english/hpc/cpu-cache/paging.md
index f0141b81..8320d437 100644
--- a/content/english/hpc/cpu-cache/paging.md
+++ b/content/english/hpc/cpu-cache/paging.md
@@ -1,9 +1,9 @@
 ---
 title: Memory Paging
-weight: 7
+weight: 12
 ---
 
-Let's consider other possible values of $D$ and try to measure loop performance. Since for values larger than 16 we will skip some cache lines altogether, requiring less memory reads and fewer cache, we change the size of the array so that the total number of cache lines fetched is constant.
+Consider [yet again](../associativity) the strided incrementing loop:
 
 ```cpp
 const int N = (1 << 13);
@@ -13,24 +13,93 @@ for (int i = 0; i < D * N; i += D)
     a[i] += 1;
 ```
 
-All we change now is the stride, and $N$ remains constant. It is equal to $2^{13}$ cache lines or $2^{13} \cdot 2^6 = 2^{19}$ bytes, precisely so that the entire addressable array can fit into L2 cache, regardless of step size. The graph should look flat, but this is not what happens.
+We change the stride $D$ and increase the array size proportionally so that the total number of iterations $N$ remains constant. As the total number of memory accesses also remains constant, for all $D \geq 16$, we should be fetching exactly $N$ cache lines — or $64 \cdot N = 2^6 \cdot 2^{13} = 2^{19}$ bytes, to be exact. This precisely fits into the L2 cache, regardless of the step size, and the throughput graph should look flat.
+
+This time, we consider a larger range of $D$ values, up to 1024. Starting from around 256, the graph is definitely not flat:
 
 ![](../img/strides.svg)
 
-This anomaly is due to the cache system, but the standard L1-L3 data caches have nothing to do with it. *Memory paging* is at fault, in particular the type of cache called *translation lookaside buffer* (TLB) that is responsible for retrieving the physical addresses of 4K memory pages of virtual memory.
+This anomaly is also due to the cache system, although the standard L1-L3 data caches have nothing to do with it. [Virtual memory](/hpc/external-memory/virtual) is at fault, in particular the *translation lookaside buffer* (TLB), which is a cache responsible for retrieving the physical addresses of the virtual memory pages.
+
+On [my CPU](https://en.wikichip.org/wiki/amd/microarchitectures/zen_2), there are two levels of TLB:
+
+- The L1 TLB has 64 entries, and if the page size is 4K, then it can handle $64 \times 4K = 512K$ of active memory without going to the L2 TLB.
+- The L2 TLB has 2048 entries, and it can handle $2048 \times 4K = 8M$ of memory without going to the page table.
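+
+These sizes are specific to the microarchitecture. If you don't want to look them up on WikiChip, you can usually query them with the `cpuid` command-line tool; the exact output format varies between versions, but it includes the TLB configuration:
+
+```bash
+$ cpuid -1 | grep -i tlb
+```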
+
+How much memory is allocated when $D$ becomes equal to 256? You've guessed it: $8K \times 256 \times 4B = 8M$, exactly the limit of what the L2 TLB can handle. When $D$ gets larger than that, some requests start getting redirected to the main page table, which has a much higher latency and very limited throughput, bottlenecking the whole computation.
+
+### Changing Page Size
 
-On our CPU, there is not one, but two layers of TLB cache. L1 TLB can house 64 entries for a total $64 \times 4K = 512K$ of data, and L2 TLB has 2048 entries for a total of $2048 \times 4K = 8M$, which is — surprise-surprise — exactly the array size at the point where the cliff starts ($8K \times 256 \times 4B = 8M$). You can fetch this information for your architecture with `cpuid` command.
+That 8MB of slowdown-free memory seems like a very tight restriction. While we can't change the characteristics of the hardware to lift it, we *can* increase the page size, which would in turn reduce the pressure on the TLB capacity.
+
+Modern operating systems allow us to set the page size both globally and for individual allocations. CPUs only support a defined set of page sizes — mine, for example, can use either 4K or 2M pages. Another typical page size is 1G — it is usually only relevant for server-grade hardware with hundreds of gigabytes of RAM. Anything over the default 4K is called *huge pages* on Linux and *large pages* on Windows.
+
+On Linux, there is a special system file that governs the allocation of huge pages. Here is how to make the kernel give you huge pages on every allocation:
+
+```bash
+$ echo always > /sys/kernel/mm/transparent_hugepage/enabled
+```
 
-This is a huge issue, as such access patterns when we need to jump large distances are actually quite common in real programs too. Why not just make page size larger? This reduces granularity of system memory allocation — increasing fragmentation. Paging is implemented both on software (OS) and hardware level, and modern operating systems actually give us freedom in choosing page size on demand. You can read more on madvise if you are interested, but for our benchmarks we will just turn on huge pages for all allocations by default like this:
+Enabling huge pages globally like this isn't always a good idea because it decreases memory granularity and raises the minimum memory that a process consumes — and some environments have more processes than free megabytes of memory. So, in addition to `always` and `never`, there is a third option in that file:
 
 ```bash
-echo always >/sys/kernel/mm/transparent_hugepage/enabled
+$ cat /sys/kernel/mm/transparent_hugepage/enabled
+always [madvise] never
+```
+
+`madvise` is a special system call that lets a program advise the kernel on whether to use huge pages for a given memory region, which makes it possible to allocate huge pages on demand. If the `madvise` mode is enabled, you can use it in C++ like this:
+
+```c++
+#include <cstdlib>     // std::aligned_alloc
+#include <sys/mman.h>  // madvise, MADV_HUGEPAGE
+
+void *ptr = std::aligned_alloc(page_size, array_size);
+madvise(ptr, array_size, MADV_HUGEPAGE);
 ```
 
-This flattens the curve:
+You can only request a memory region to be allocated using huge pages if it has the corresponding alignment.
+
+Windows has similar functionality. Its memory API combines these two functions into one:
+
+```c++
+#include "memoryapi.h"
+
+void *ptr = VirtualAlloc(NULL, array_size,
+                         MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE);
+```
+
+In both cases, `array_size` should be a multiple of `page_size`.
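+
+To verify that the kernel actually backed the allocation with huge pages (a quick Linux-only check), you can look at the anonymous huge page counter before and after the allocation:
+
+```bash
+$ grep AnonHugePages /proc/meminfo  # the value (in kB) should grow by about array_size
+```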
+
+### Impact of Huge Pages
+
+Both variants of allocating huge pages immediately flatten the curve:
 
 ![](../img/strides-hugepages.svg)
 
+Enabling huge pages also improves [latency](../latency) by up to 10-15% for arrays that don't fit into the L2 cache:
+
+![](../img/permutation-hugepages.svg)
+
+In general, enabling huge pages is a good idea when you have any sort of sparse reads, as they usually improve performance slightly and ([almost](../aos-soa)) never hurt it.
+
+That said, you shouldn't rely on huge pages when you can avoid it, as they aren't always available due to hardware or computing environment restrictions. There are [many](../cache-lines) [other](../prefetching) [reasons](../aos-soa) why grouping data accesses spatially is beneficial, and doing so automatically solves the paging problem as well.
+
+
+
diff --git a/content/english/hpc/cpu-cache/pointers.md b/content/english/hpc/cpu-cache/pointers.md
index 5b46d97d..a4e3857a 100644
--- a/content/english/hpc/cpu-cache/pointers.md
+++ b/content/english/hpc/cpu-cache/pointers.md
@@ -1,12 +1,28 @@
 ---
 title: Pointer Alternatives
-weight: 12
+weight: 10
 ---
 
+In the [pointer chasing benchmark](../latency), for simplicity, we didn't use actual pointers, but integer indices relative to a base address:
 
-Memory addressing operator is fused on x86, so `k = q[k]` folds into one terse `mov rax, DWORD PTR q[0+rax*4]` instruction, although it does a multiplication by 4 and an addition under the hood. Although fully fused, These additional computations actually add some delay to memory operations, and in fact the latency of L1 fetch is 4 or 5 cycles — the latter being the case if we need to perform complex computation of address. For this reason, the permutation benchmark measures 3ns or 6 cycles per fetch: 5 for the read (including +1 for address computation) and 1 to move the result to the right register.
+```c++
+for (int i = 0; i < N; i++)
+    k = q[k];
+```
+
+[The memory addressing operator](/hpc/architecture/assembly#addressing-modes) on x86 is fused with the address computation, so the `k = q[k]` line folds into just a single terse instruction that also does multiplication by 4 and addition under the hood:
+
+```nasm
+mov rax, DWORD PTR q[0+rax*4]
+```
+
+Although fully fused, these additional computations add some delay to memory operations. The latency of an L1 fetch is either 4 or 5 cycles, the latter being the case when a more complex address computation is needed. For this reason, the permutation benchmark measures 3ns, or 6 cycles per jump: 4+1 for the read and address computation and one more to move the result to the right register.
 
-We can make our benchmark run slightly faster if we replace "fake pointers" — indices — with actual pointers. There are some syntactical issues in getting "pointer to pointer to pointer…" constructions to work, so instead we will define a struct type that just wraps a pointers to its own kind — this is how most pointer chasing works anyway:
+### Pointers
+
+We can make our benchmark run slightly faster if we replace "fake pointers" — indices — with actual pointers.
+
+There are some syntactical issues in getting "pointer to pointer to pointer…" constructions to work, so instead we will define a struct that just wraps a pointer to its own type — this is how most pointer chasing works anyway:
 
 ```cpp
 struct node { node* ptr; };
@@ -24,23 +40,27 @@ for (int i = 0; i < N; i++)
     k = k->ptr;
 ```
 
-This code now runs in 2ns / 4 cycles for arrays that fit in L1 cache. Why not 4+1=5? Because Zen 2 [has an interesting feature](https://www.agner.org/forum/viewtopic.php?t=41) that allows zero-latency reuse of data accessed just by address, so the "move" here is transparent, resulting in whole 2 cycles saved.
+This code now runs in 2ns / 4 cycles for arrays that fit in the L1 cache. Why not 4+1=5? Because Zen 2 [has an interesting feature](https://www.agner.org/forum/viewtopic.php?t=41) that allows zero-latency reuse of data accessed just by address, so the "move" here is transparent, resulting in two whole cycles saved.
 
-Unfortunately, there is a problem with it on 64-bit systems as the pointers become twice as large, making the array spill out of cache much sooner compared to using a 32-bit index. Graph looks like if it was shifted by one power of two to the left — exactly like it should.
+Unfortunately, there is a problem with it on 64-bit systems: the pointers become twice as large, making the array spill out of cache much sooner compared to using a 32-bit index. The latency-versus-size graph looks as if it were shifted by one power of two to the left — exactly as it should:
 
 ![](../img/permutation-p64.svg)
 
-This problem is mitigated by switching to 32-bit mode. You need to go [through some trouble](https://askubuntu.com/questions/91909/trouble-compiling-a-32-bit-binary-on-a-64-bit-machine) getting 32-bit libs to get this running on a computer made in this century, but this is justified by the result — unless you also need to interoperate with 64-bit software or access more than 4G or RAM.
+This problem is mitigated by switching to the 32-bit mode:
 
 ![](../img/permutation-p32.svg)
 
-The fact that on larger problem sizes the performance is bottlenecked by memory rather than CPU lets us to try something even more stranger: using less than 4 bytes for storing indices. This can be done with bit fields:
+You need to go [through some trouble](https://askubuntu.com/questions/91909/trouble-compiling-a-32-bit-binary-on-a-64-bit-machine) getting 32-bit libs to get this running on a computer made in this century, but it shouldn't pose other problems unless you need to interoperate with 64-bit software or access more than 4G of RAM.
+
+### Bit Fields
+
+The fact that on larger problem sizes the performance is bottlenecked by memory rather than CPU lets us try something even more strange: we can use less than 4 bytes for storing indices. This can be done with [bit fields](../alignment#bit-fields):
 
 ```cpp
 struct __attribute__ ((packed)) node { int idx : 24; };
 ```
 
-You don't need to do anything other than defining a structure for the bit field. The CPU does truncation by itself.
+You don't need to do anything other than define a structure with the bit field — the compiler handles the 3-byte integer all by itself:
 
 ```cpp
 int k = p[N - 1];
@@ -52,7 +72,7 @@ for (int i = 0; i < N; i++) {
     k = q[k].idx;
 ```
 
-This measures at 6.5ns in the L1 cache, but the conversion procedure chosen by the compiler is suboptimal: it is done by loading 3 bytes, which is not optimal. Instead, we could just load a 4-byte integer and truncate it ourselves (we also need to add one more element to the `q` array to ensure we own that extra one byte of memory):
+This code measures at 6.5ns for the L1 cache. There is some room for improvement, as the default conversion procedure chosen by the compiler is suboptimal. Instead, we can manually load a 4-byte integer and truncate it ourselves (we also need to add one more element to the `q` array to ensure we own that extra byte of memory):
 
 ```cpp
 k = *((int*) (q + k));
@@ -63,4 +83,6 @@ It now runs in 4ns, and produces the following graph:
 
 ![](../img/permutation-bf-custom.svg)
 
-In short: for something very small, use pointers; for something very large, use bit fields.
+If you zoom close enough ([the graph is an svg](../img/permutation-bf-custom.svg)), you'll see that the pointers win on very small arrays, then starting from around the L2-L3 cache boundary our custom bit fields take over, and for very large arrays it doesn't matter because we never hit cache anyway.
+
+This isn't a kind of optimization that can give you a 5x improvement, but it's still something to try when all the other resources are exhausted.
diff --git a/content/english/hpc/cpu-cache/prefetching.md b/content/english/hpc/cpu-cache/prefetching.md
new file mode 100644
index 00000000..4f5a7545
--- /dev/null
+++ b/content/english/hpc/cpu-cache/prefetching.md
@@ -0,0 +1,120 @@
+---
+title: Prefetching
+weight: 6
+---
+
+Taking advantage of the [free concurrency](../mlp) available in memory hardware, it can be beneficial to *prefetch* data that is likely to be accessed next if its location can be predicted. This is easy to do when there are no [data or control hazards](/hpc/pipelining/hazards) in the pipeline and the CPU can simply run ahead of the instruction stream and execute memory operations out of order.
+
+But sometimes the memory locations aren't in the instruction stream, and yet they can still be predicted with high probability. In these cases, they can be prefetched by other means:
+
+- Explicitly, by separately reading the next data word or any of the bytes in the same cache line, so that it is lifted in the cache hierarchy.
+- Implicitly, by using simple access patterns such as linear iteration, which are detectable by the memory hardware that can start prefetching automatically.
+
+Hiding memory latency is crucial for achieving performance, so in this section, we will look into prefetching techniques.
+
+### Hardware Prefetching
+
+Let's modify the [pointer chasing](../latency) benchmark to show the effect of hardware prefetching. Now, we generate our permutation in a way that makes the CPU request consecutive cache lines when iterating over the permutation, while still accessing the elements within each cache line in random order:
+
+```cpp
+int p[15], q[N];
+
+iota(p, p + 15, 1);
+
+for (int i = 0; i + 16 < N; i += 16) {
+    random_shuffle(p, p + 15);
+    int k = i;
+    for (int j = 0; j < 15; j++)
+        k = q[k] = i + p[j];
+    q[k] = i + 16;
+}
+```
+
+There is no point in making a graph because it would be just flat: the latency is 3ns regardless of the array size. Even though the instruction scheduler still can't tell what we are going to fetch next, the memory prefetcher can detect a pattern just by looking at the memory accesses and start loading the next cache line ahead of time, mitigating the latency.
+
+Hardware prefetching is smart enough for most use cases, but it only detects simple patterns. You can iterate forward and backward over multiple arrays in parallel, perhaps with small-to-medium strides, but that's about it. For anything more complex, the prefetcher won't figure out what's happening, and we need to help it out ourselves.
+
+### Software Prefetching
+
+The simplest way to do software prefetching is to load any byte of the cache line with `mov` or any other memory instruction, but CPUs also have a separate `prefetch` instruction that lifts a cache line into the cache without doing anything else with it. This instruction isn't a part of the C or C++ standard, but it is available in most compilers as the `__builtin_prefetch` intrinsic:
+
+```c++
+__builtin_prefetch(&a[k]);
+```
+
+It's quite hard to come up with a *simple* example where it is useful. To make the pointer chasing benchmark benefit from software prefetching, we need to construct a permutation that simultaneously loops around the whole array, can't be predicted by the hardware prefetcher, and has easily computable next addresses.
+
+Luckily, an iteration of a [linear congruential generator](https://en.wikipedia.org/wiki/Linear_congruential_generator) fits the bill: if the modulus $n$ is a prime number (and $2$ is a primitive root modulo $n$), the sequence $k \mapsto (2k + 1) \bmod n$ cycles through all residues but one before repeating, so its period is almost exactly $n$. Thus, we get all the properties we need if we use the permutation it generates, with the current index as the state:
+
+```cpp
+const int n = find_prime(N); // largest prime not exceeding N
+
+for (int i = 0; i < n; i++)
+    q[i] = (2 * i + 1) % n;
+```
+
+When we run it, the performance matches that of a normal random permutation, but now we have the ability to peek ahead:
+
+```cpp
+int k = 0;
+
+for (int t = 0; t < K; t++) {
+    for (int i = 0; i < n; i++) {
+        __builtin_prefetch(&q[(2 * k + 1) % n]);
+        k = q[k];
+    }
+}
+```
+
+There is some overhead to computing the next address, but for arrays large enough, it is almost two times faster:
+
+![](../img/sw-prefetch.svg)
+
+Interestingly, we can prefetch more than just one element ahead, making use of this pattern in the LCG function:
+
+$$
+\begin{aligned}
+   f(x)   &= 2 \cdot x + 1
+\\ f^2(x) &= 4 \cdot x + 2 + 1
+\\ f^3(x) &= 8 \cdot x + 4 + 2 + 1
+\\ &\ldots
+\\ f^k(x) &= 2^k \cdot x + (2^k - 1)
+\end{aligned}
+$$
+
+Hence, to load the `D`-th element ahead, we can do this:
+
+```cpp
+__builtin_prefetch(&q[((1 << D) * k + (1 << D) - 1) % n]);
+```
+
+If we execute this request on every iteration, we will be simultaneously prefetching `D` elements ahead on average, increasing the throughput by up to a factor of `D`. Ignoring some issues such as integer overflow when `D` is too large, we can bring the average latency arbitrarily close to the cost of computing the next index (which, in this case, is dominated by the [modulo operation](/hpc/arithmetic/division)).
+
+![](../img/sw-prefetch-others.svg)
+
+Note that this is an artificial example, and attempts to add software prefetching to practical programs actually fail more often than not. This is largely because you need to issue a separate memory instruction that may compete for resources with the others. At the same time, hardware prefetching is 100% harmless, as it only activates when the memory and cache buses are not busy.
+
+You can also specify the level of cache the data needs to be brought to when doing software prefetching — useful when you aren't sure if you will be using the data and don't want to kick out what is already in the L1 cache. This is done with the `_mm_prefetch` intrinsic, which takes a locality hint as its second parameter, specifying the target cache level. It is useful in combination with [non-temporal loads and stores](../bandwidth#bypassing-the-cache).
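+
+A minimal sketch of how this looks (the array `q` and the index `i` are placeholders):
+
+```c++
+#include <xmmintrin.h>
+
+// lift q[i] into the L2 cache and higher levels, but not into L1
+_mm_prefetch((const char*) &q[i], _MM_HINT_T1);
+
+// lift it into every cache level (the same as the default __builtin_prefetch)
+_mm_prefetch((const char*) &q[i], _MM_HINT_T0);
+```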
+
+
diff --git a/content/english/hpc/cpu-cache/sharing.md b/content/english/hpc/cpu-cache/sharing.md
index c9341822..f3d3e23f 100644
--- a/content/english/hpc/cpu-cache/sharing.md
+++ b/content/english/hpc/cpu-cache/sharing.md
@@ -1,46 +1,57 @@
 ---
 title: Memory Sharing
-weight: 4
+weight: 3
 ---
 
-Starting from a certain level in the hierarchy, cache becomes shared between different cores. This limits the size and bandwidth of the cache, reducing performance in case of parallel algorithms or just noisy neighbors.
+Starting at some level of the hierarchy, the cache becomes *shared* between different cores. This reduces the total die area and lets you add more cores on a single chip but also poses some "noisy neighbor" problems as it limits the effective cache size and bandwidth available to a single execution thread.
 
-On my machine, there is actually not 4M, but 8M of L3 cache, but it is shared between groups of 4 cores so that each core "sees" only 4M that is shared with 3 other cores — and, of course, all the cores have uniform access to RAM. There may be more complex situations, especially in the case of multi-socket and NUMA architectures. The "topology" of the cache system can be retrieved with the `lstopo` utility.
+On most CPUs, only the last layer of cache is shared, and not always in a uniform manner. On my machine, there are 8 physical cores, and the size of the L3 cache is 8M, but it is split into two halves: each group of 4 cores has access only to its own 4M region of the L3 cache, not to all of it.
 
-![Cache hierarchy scheme generated by lstopo command on Linux](../img/lstopo.png)
+There are even more complex topologies, where accessing certain regions of memory takes a different amount of time depending on the core (which is [sometimes unintended](https://randomascii.wordpress.com/2022/01/12/5-5-mm-in-1-25-nanoseconds/)). Such an architectural feature is called *non-uniform memory access* (NUMA), and it is typical for multi-socket systems that have several separate CPU chips installed.
 
-This has some very important implications for certain parallel algorithms:
+On Linux, the topology of the memory system can be retrieved with `lstopo`:
 
-- If and algorithm is memory-bound, then it doesn't matter how much cores you add, as it will be bottlenecked by the RAM bandwidth.
-- On non-uniform architectures, it matters which cores are running which execution threads.
+![Cache hierarchy of my Ryzen 7 4700U generated by lstopo](../img/lstopo.png)
 
-To show this, we can run the same benchmarks in parallel. Instead of changing source code to run multiple threads, we can make use of GNU parallel. Due to the asymmetry `taskset` to manage CPU affinity and set them to the first "half" of cores (to temporary ignore the second issue).
+This has some important implications for parallel algorithms: the performance of multi-threaded memory accesses depends on which cores are running which execution threads. To demonstrate this, we will run the [bandwidth benchmarks](../bandwidth) in parallel.
+
+### CPU Affinity
+
+Instead of modifying the source code to run on multiple threads, we can simply run multiple identical processes with [GNU parallel](https://www.gnu.org/software/parallel/). To control which cores are executing them, we set their *processor affinity* with `taskset`. This combined command launches 4 identical processes that are restricted to the first 4 cores of the CPU:
 
 ```bash
 parallel taskset -c 0,1,2,3 ./run ::: {0..3}
 ```
 
-You can now see that the L3 effects diminishes with more cores competing for it, and after falling into the RAM region the total performance remains constant.
+Here is what we get when we change the number of processes running simultaneously:
 
 ![](../img/parallel.svg)
 
-TODO: note about RAM
+You can now see that the performance decreases with more processes when the array exceeds the L2 cache (which is private to each core), as the cores start competing for the shared L3 cache and the RAM.
 
-This asymmetry makes it important to manage where exactly different threads should be running. By default, the operating systems knows nothing about affinity, so it assigns threads to cores arbitrarily and dynamically during execution, based on core load and job priority, and settings of the scheduler. This can be affected directly, which is what we did with `taskset` to restrict the available cores to the first half that share the same 4M region of L3.
+We specifically set all processes to run on the first 4 cores because they have a unified L3 cache. If some of the processes were to be scheduled on the other half of the cores, there would be less contention for the L3 cache. The operating system doesn't [monitor](/hpc/profiling/events) such activities — what a process does is its own private business — so by default, it assigns threads to cores arbitrarily during execution, without caring about cache affinity and only taking into account the core load.
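+
+If you want to pin threads from inside the program itself rather than wrapping it in `taskset`, you can set the affinity mask directly. A Linux-specific sketch, with `core_id` being a placeholder for the desired core number:
+
+```c++
+#define _GNU_SOURCE
+#include <sched.h>
+
+cpu_set_t mask;
+CPU_ZERO(&mask);
+CPU_SET(core_id, &mask);                   // allow running only on core_id
+sched_setaffinity(0, sizeof(mask), &mask); // 0 means "the calling thread"
+```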
 
-Let's add another 2-thread run, but now with running on cores in different 4-core groups that don't share L3 cache:
+Let's run another benchmark, but now with pinning the processes to different 4-core groups that don't share L3 cache:
 
 ```bash
-parallel taskset -c 0,1 ./run ::: {0..1}
-parallel taskset -c 0,4 ./run ::: {0..1}
+parallel taskset -c 0,1 ./run ::: {0..1}  # L3 cache sharing
+parallel taskset -c 0,4 ./run ::: {0..1}  # no L3 cache sharing
 ```
 
-You can see that it performs better — as if there were twice as much L3 cache available.
+It performs better — as if there were twice as much L3 cache and RAM bandwidth available:
 
 ![](../img/affinity.svg)
 
-These issues are especially tricky when benchmarking and is usually the largest source of noise in real-world applications.
+These issues are especially tricky when benchmarking and are a huge source of noise when timing parallel applications.
+
+### Saturating Bandwidth
+
+When looking at the RAM section of the first graph, it may seem that with more cores, the per-process throughput drops to ½, ⅓, ¼, and so on, while the total bandwidth remains constant. But this isn't quite true: the contention hurts, but a single CPU core usually can't saturate all of the RAM bandwidth by itself.
+
+If we plot it more carefully, we see that the total bandwidth actually increases with the number of cores — although not proportionally, and eventually approaches its theoretical maximum of ~42.4 GB/s:
+
+![](../img/parallel-bandwidth.svg)
 
-Non-uniform memory access, RAM paging
+Note that we still specify processor affinity: the $k$-threaded run uses the first $k$ cores. This is why we have such a huge performance increase when switching from 4 cores to 5: you can have more RAM bandwidth if the requests go through separate L3 caches.
 
-https://randomascii.wordpress.com/2022/01/12/5-5-mm-in-1-25-nanoseconds/
+In general, to achieve maximum bandwidth, you should always spread the threads of an application evenly across the separate cache domains.
diff --git a/content/english/hpc/cpu-cache/sw-prefetching.md b/content/english/hpc/cpu-cache/sw-prefetching.md
deleted file mode 100644
index 2db5ce19..00000000
--- a/content/english/hpc/cpu-cache/sw-prefetching.md
+++ /dev/null
@@ -1,52 +0,0 @@
----
-title: Software Prefetching
-weight: 6
----
-
-Sometimes the hardware can't figure out what to prefetch next by itself, and in this case, we need to point it explicitly.
-
-The easiest thing is to just use any byte in the cache line as an operand, but CPUs have an explicit instruction to just "lift" a cache line without doing anything with it. As far as I know, this instruction is not a part of the C/C++ standard or any other language, but is widely available in compilers.
-
-It turned out it is non-trivial to design such a permutation case that simultaneously loops around all the array, can't be predicted by hardware prefetching but the next address is easily computable in order to do prefetching.
-
-Luckily, LCG can be used. It is a known property that if ..., then the period will be exactly $n$. So, we will modify our algorithm so that the permutation is generated by LCG, using current index as the state:
-
-```cpp
-const int n = find_prime(N);
-
-for (int i = 0; i < n; i++)
-    q[i] = (2 * i + 1) % n;
-```
-
-Running it, the performance is the same as with the fully random permutation. But now we have the capability of peeking a bit ahead:
-
-```cpp
-int k = 0;
-
-for (int t = 0; t < K; t++) {
-    for (int i = 0; i < n; i++) {
-        __builtin_prefetch(&q[(2 * k + 1) % n]);
-        k = q[k];
-    }
-}
-```
-
-It is almost 2 times faster, as we expected.
-
-![](../img/sw-prefetch.svg)
-
-Interestingly, we can cut it arbitrarily close (to the cost of computing the next index — [modulo is expensive](../arithmetic/integer)).
-
-One can show that in order to load $k$-th element ahead, we can do this:
-
-```cpp
-__builtin_prefetch(&q[((1 << D) * k + (1 << D) - 1) % n]);
-```
-
-Managing issues such as integer overflow, we can cut latency down arbitrarily close to just calculating the address using the formula.
-
-![](../img/sw-prefetch-others.svg)
-
-
\ No newline at end of file
diff --git a/content/english/hpc/data-structures/_index.md b/content/english/hpc/data-structures/_index.md
index 880b3d43..6034f8b0 100644
--- a/content/english/hpc/data-structures/_index.md
+++ b/content/english/hpc/data-structures/_index.md
@@ -1,7 +1,10 @@
 ---
 title: Data Structures Case Studies
 weight: 12
-draft: true
 ---
 
-Optimizing data structures is different from optimizing algorithms. It is harder. Each new aspect multiplies the design complexity. A lot more attention needs to be attached to memory and latency-bandwidth trade-offs.
+Optimizing data structures is different from optimizing [algorithms](/hpc/algorithms) as data structure problems have more dimensions: you may be optimizing for *throughput*, for *latency*, for *memory usage*, or any combination of those — and this complexity blows up exponentially when you need to process *multiple* query types and consider multiple query distributions.
+
+This makes simply [defining benchmarks](/hpc/profiling/noise/) much harder, let alone the actual implementations. In this chapter, we will try to navigate all this complexity and learn how to design efficient data structures with extensive case studies.
+
+A brief review of the [CPU cache system](/hpc/cpu-cache) is strongly advised.
diff --git a/content/english/hpc/data-structures/b-tree.md b/content/english/hpc/data-structures/b-tree.md
new file mode 100644
index 00000000..0189a185
--- /dev/null
+++ b/content/english/hpc/data-structures/b-tree.md
@@ -0,0 +1,399 @@
+---
+title: Search Trees
+weight: 3
+---
+
+In the [previous article](../s-tree), we designed and implemented *static* B-trees to speed up binary searching in sorted arrays. In its [last section](../s-tree/#as-a-dynamic-tree), we briefly discussed how to make them *dynamic* again while retaining the performance gains from [SIMD](/hpc/simd), and we validated our predictions by adding and following explicit pointers in the internal nodes of the S+ tree.
+
+In this article, we follow up on that proposition and design a minimally functional search tree for integer keys, [achieving](#evaluation) up to 18x/8x speedup over `std::set` and up to 7x/2x speedup over [`absl::btree`](https://abseil.io/blog/20190812-btree) for `lower_bound` and `insert` queries, respectively — with yet ample room for improvement.
+
+The memory overhead of the structure is around 30% for 32-bit integers, and the final implementation is [under 150 lines of C++](https://github.com/sslotin/amh-code/blob/main/b-tree/btree-final.cc). It can be easily generalized to other arithmetic types and small/fixed-length strings such as hashes, country codes, and stock symbols.
+
+
+
+## B− Tree
+
+Instead of making small incremental improvements like we usually do in other case studies, in this article, we will implement just one data structure that we name *B− tree*, which is based on the [B+ tree](../s-tree/#b-tree-layout-1), with a few minor differences:
+
+- Nodes in the B− tree do not store pointers or any metadata except for the pointers to internal node children (while the B+ tree leaf nodes store a pointer to the next leaf node). This lets us perfectly place the keys in the leaf nodes on cache lines.
+- We define key $i$ to be the *maximum* key in the subtree of the child $i$ instead of the *minimum* key in the subtree of the child $(i + 1)$. This lets us not fetch any other nodes after we reach a leaf (in the B+ tree, all keys in the leaf node may be less than the search key, so we need to go to the next leaf node to fetch its first element).
+
+We also use a node size of $B=32$, which is smaller than typical. It is not $16$, which was [optimal for the S+ tree](../s-tree/#modifications-and-further-optimizations), because we now have the additional overhead of fetching a child pointer, and the benefit of reducing the tree height by ~20% outweighs the cost of processing twice as many elements per node. A larger node size also improves the running time of the `insert` query, which needs to perform a costly node split every $\frac{B}{2}$ insertions on average.
+
+
+
+### Memory Layout
+
+Although this is probably not the best approach in terms of software engineering, we will simply store the entire tree in a large pre-allocated array, without discriminating between leaves and internal nodes:
+
+```c++
+const int R = 1e8;
+alignas(64) int tree[R];
+```
+
+We also pre-fill this array with infinities to simplify the implementation:
+
+```c++
+for (int i = 0; i < R; i++)
+    tree[i] = INT_MAX;
+```
+
+(In general, it is technically cheating to compare against `std::set` or other structures that use `new` under the hood, but memory allocation and initialization are not the bottlenecks here, so this does not significantly affect the evaluation.)
+
+Both node types store their keys sequentially in sorted order and are identified by the index of their first key in the array:
+
+- A leaf node has up to $(B - 1)$ keys but is padded to $B$ elements with infinities.
+- An internal node has up to $(B - 2)$ keys padded to $B$ elements and up to $(B - 1)$ indices of its child nodes, also padded to $B$ elements.
+
+These design decisions are not arbitrary:
+
+- The padding ensures that leaf nodes occupy exactly 2 cache lines and internal nodes occupy exactly 4 cache lines.
+- We specifically use [indices instead of pointers](/hpc/cpu-cache/pointers/) to save cache space and make moving them with SIMD faster.  
+  (We will use "pointer" and "index" interchangeably from now on.)
+- We store indices right after the keys even though they are stored in separate cache lines because [we have reasons](/hpc/cpu-cache/aos-soa/).
+- We intentionally "waste" one array cell in leaf nodes and $2+1=3$ cells in internal nodes because we need it to store temporary results during a node split.
+
+Initially, we only have one empty leaf node as the root:
+
+```c++
+const int B = 32;
+
+int root = 0;   // where the keys of the root start
+int n_tree = B; // number of allocated array cells
+int H = 1;      // current tree height
+```
+
+To "allocate" a new node, we simply increase `n_tree` by $B$ if it is a leaf node or by $2 B$ if it is an internal node. 
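+
+In other words, node "allocation" is just a bump allocator. A small sketch of a helper that the actual code simply inlines:
+
+```c++
+// bump-allocate `cells` array cells (B for a leaf node, 2 * B for an internal one)
+// and return the index of the new node
+int alloc(int cells) {
+    int node = n_tree;
+    n_tree += cells;
+    return node;
+}
+```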
+
+Since new nodes can only be created by splitting a full node, each node except for the root will be at least half full. This implies that we need between 4 and 8 bytes per integer element (the internal nodes will contribute $\frac{1}{16}$-th or so to that number), the former being the case when the inserts are sequential, and the latter being the case when the input is adversarial. When the queries are uniformly distributed, the nodes are ~75% full on average, projecting to ~5.2 bytes per element.
+
+B-trees are very memory-efficient compared to the pointer-based binary trees. For example, `std::set` needs at least three pointers (the left child, the right child, and the parent), alone costing $3 \times 8 = 24$ bytes, plus at least another $8$ bytes to store the key and the meta-information due to [structure padding](/hpc/cpu-cache/alignment/).
+
+### Searching
+
+It is a very common scenario for >90% of the operations to be lookups, and even when this is not the case, every other tree operation typically begins with locating a key anyway, so we will start by implementing and optimizing the searches.
+
+When we implemented [S-trees](../s-tree/#optimization), we ended up storing the keys in permuted order due to the intricacies of how the blending/packs instructions work. For the *dynamic tree* problem, storing the keys in permuted order would make inserts much harder to implement, so we will change the approach instead.
+
+An alternative way to think about finding the would-be position of the element `x` in a sorted array is not "the index of the first element that is not less than `x`" but "the number of elements that are less than `x`." This observation generates the following idea: compare the keys against `x`, aggregate the vector masks into a 32-bit mask (where each bit can correspond to any element as long as the mapping is bijective), and then call `popcnt` on it, returning the number of elements less than `x`.
+
+This trick lets us perform the local search efficiently and without requiring any shuffling:
+
+```c++
+typedef __m256i reg;
+
+reg cmp(reg x, int *node) {
+    reg y = _mm256_load_si256((reg*) node);
+    return _mm256_cmpgt_epi32(x, y);
+}
+
+// returns how many keys are less than x
+unsigned rank32(reg x, int *node) {
+    reg m1 = cmp(x, node);
+    reg m2 = cmp(x, node + 8);
+    reg m3 = cmp(x, node + 16);
+    reg m4 = cmp(x, node + 24);
+
+    // take lower 16 bits from m1/m3 and higher 16 bits from m2/m4
+    m1 = _mm256_blend_epi16(m1, m2, 0b01010101);
+    m3 = _mm256_blend_epi16(m3, m4, 0b01010101);
+    m1 = _mm256_packs_epi16(m1, m3); // can also use blendv here, but packs is simpler
+
+    unsigned mask = _mm256_movemask_epi8(m1);
+    return __builtin_popcount(mask);    
+}
+```
+
+Note that, because of this procedure, we have to pad the "key area" with infinities, which prevents us from storing metadata in the vacated cells (unless we are also willing to spend a few cycles to mask it out when loading a SIMD lane).
+
+Now, to implement `lower_bound`, we can descend the tree just like we did in the S+ tree, but fetching the pointer after we compute the child number:
+
+```c++
+int lower_bound(int _x) {
+    unsigned k = root;
+    reg x = _mm256_set1_epi32(_x);
+    
+    for (int h = 0; h < H - 1; h++) {
+        unsigned i = rank32(x, &tree[k]);
+        k = tree[k + B + i];
+    }
+
+    unsigned i = rank32(x, &tree[k]);
+
+    return tree[k + i];
+}
+```
+
+Implementing search is easy, and it doesn't introduce much overhead. The hard part is implementing insertion.
+
+### Insertion
+
+On the one hand, correctly implementing insertion takes a lot of code, but on the other, most of that code is executed very infrequently, so we don't have to care about its performance that much. Most often, all we need to do is reach the leaf node (which we've already figured out how to do) and then insert a new key into it, moving some suffix of its keys one position to the right. Occasionally, we also need to split the node and/or update some ancestors, but this is relatively rare, so let's focus on the most common execution path first.
+
+To insert a key into an array of $(B - 1)$ sorted elements, we can load them in vector registers and then [mask-store](/hpc/simd/masking) them one position to the right using a [precomputed](/hpc/compilation/precalc/) mask that tells which elements need to be written for a given `i`:
+
+```c++
+struct Precalc {
+    alignas(64) int mask[B][B];
+
+    constexpr Precalc() : mask{} {
+        for (int i = 0; i < B; i++)
+            for (int j = i; j < B - 1; j++)
+                // everything from i to B - 2 inclusive needs to be moved
+                mask[i][j] = -1;
+    }
+};
+
+constexpr Precalc P;
+
+void insert(int *node, int i, int x) {
+    // need to iterate right-to-left to not overwrite the first element of the next lane
+    for (int j = B - 8; j >= 0; j -= 8) {
+        // load the keys
+        reg t = _mm256_load_si256((reg*) &node[j]);
+        // load the corresponding mask
+        reg mask = _mm256_load_si256((reg*) &P.mask[i][j]);
+        // mask-write them one position to the right
+        _mm256_maskstore_epi32(&node[j + 1], mask, t);
+    }
+    node[i] = x; // finally, write the element itself
+}
+```
+
+This [constexpr magic](/hpc/compilation/precalc/) is the only C++ feature we use.
+
+There are other ways to do it, some possibly more efficient, but we are going to stop there for now.
+
+When we split a node, we need to move half of the keys to another node, so let's write another primitive that does it:
+
+```c++
+// move the second half of a node and fill it with infinities
+void move(int *from, int *to) {
+    const reg infs = _mm256_set1_epi32(INT_MAX);
+    for (int i = 0; i < B / 2; i += 8) {
+        reg t = _mm256_load_si256((reg*) &from[B / 2 + i]);
+        _mm256_store_si256((reg*) &to[i], t);
+        _mm256_store_si256((reg*) &from[B / 2 + i], infs);
+    }
+}
+```
+
+With these two vector functions implemented, we can now very carefully implement insertion:
+
+```c++
+void insert(int _x) {
+    // the beginning of the procedure is the same as in lower_bound,
+    // except that we save the path in case we need to update some of our ancestors
+    unsigned sk[10], si[10]; // k and i on each iteration
+    //           ^------^ We assume that the tree height does not exceed 10
+    //                    (which would require at least 16^10 elements)
+    
+    unsigned k = root;
+    reg x = _mm256_set1_epi32(_x);
+
+    for (int h = 0; h < H - 1; h++) {
+        unsigned i = rank32(x, &tree[k]);
+
+        // optionally update the key i right away
+        tree[k + i] = (_x > tree[k + i] ? _x : tree[k + i]);
+        sk[h] = k, si[h] = i; // and save the path
+        
+        k = tree[k + B + i];
+    }
+
+    unsigned i = rank32(x, &tree[k]);
+
+    // we can start computing the is-full check before insertion completes
+    bool filled  = (tree[k + B - 2] != INT_MAX);
+
+    insert(tree + k, i, _x);
+
+    if (filled) {
+        // the node needs to be split, so we create a new leaf node
+        move(tree + k, tree + n_tree);
+        
+        int v = tree[k + B / 2 - 1]; // new key to be inserted
+        int p = n_tree;              // pointer to the newly created node
+        
+        n_tree += B;
+
+        for (int h = H - 2; h >= 0; h--) {
+            // ascend and repeat until we reach the root or find a node that does not need to be split
+            k = sk[h], i = si[h];
+
+            filled = (tree[k + B - 3] != INT_MAX);
+
+            // the node already has a correct key (the right one)
+            //                  and a correct pointer (the left one)
+            insert(tree + k,     i,     v);
+            insert(tree + k + B, i + 1, p);
+            
+            if (!filled)
+                return; // we're done
+
+            // create a new internal node
+            move(tree + k,     tree + n_tree);     // move keys
+            move(tree + k + B, tree + n_tree + B); // move pointers
+
+            v = tree[k + B / 2 - 1];
+            tree[k + B / 2 - 1] = INT_MAX;
+
+            p = n_tree;
+            n_tree += 2 * B;
+        }
+
+        // if we reach here, it means we've gone all the way up to the root,
+        // and it was split into two, so we need a new root
+        tree[n_tree] = v;
+
+        tree[n_tree + B] = root;
+        tree[n_tree + B + 1] = p;
+
+        root = n_tree;
+        n_tree += 2 * B;
+        H++;
+    }
+}
+```
+
+There are many inefficiencies, but, luckily, the body of `if (filled)` is executed very infrequently — approximately once every $\frac{B}{2}$ insertions (a leaf created by a split starts about half-full, so it takes roughly that many more insertions before it overflows again) — and the insertion performance is not really our top priority, so we will just leave it as it is.
+
+## Evaluation
+
+We have only implemented `insert` and `lower_bound`, so this is what we will measure.
+
+We want the evaluation to take a reasonable time, so our benchmark is a loop that alternates between two steps:
+
+- Increase the structure size from $1.17^k$ to $1.17^{k+1}$ using individual `insert`s and measure the time it took.
+- Perform $10^6$ random `lower_bound` queries and measure the time it took.
+
+We start at the size $10^4$ and end at $10^7$, for around $50$ data points in total. We generate the data for both query types uniformly in the $[0, 2^{30})$ range and independently between the stages. Since the data generation process allows for repeated keys, we compared against `std::multiset` and `absl::btree_multiset`[^absl], although we still refer to them as `std::set` and `absl::btree` for brevity. We also enable [hugepages](/hpc/cpu-cache/paging) on the system level for all three runs.
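+For concreteness, the measurement loop could look something like this (a simplified sketch of my own, not the exact benchmarking harness; it assumes the `insert` and `lower_bound` defined above):
+
+```c++
+#include <chrono>
+#include <cstdio>
+#include <random>
+
+int main() {
+    std::mt19937 rng(42);
+    auto rand_key = [&] { return (int) (rng() % (1u << 30)); };
+    auto now = [] { return std::chrono::steady_clock::now(); };
+
+    int size = 0;
+    for (double cap = 1e4; cap <= 1e7; cap *= 1.17) {
+        auto t0 = now();
+        int inserted = 0;
+        while (size < cap) {                // phase 1: individual insertions
+            insert(rand_key());
+            size++, inserted++;
+        }
+        auto t1 = now();
+        int checksum = 0;
+        for (int i = 0; i < 1000000; i++)   // phase 2: 10^6 random lookups
+            checksum ^= lower_bound(rand_key());
+        auto t2 = now();
+
+        auto ns = [](auto a, auto b) {
+            return std::chrono::duration<double, std::nano>(b - a).count();
+        };
+        printf("n=%d: %.1f ns/insert, %.1f ns/query (checksum %d)\n",
+               size, ns(t0, t1) / inserted, ns(t1, t2) / 1e6, checksum);
+    }
+}
+```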
+
+[^absl]: If you also think that only comparing with Abseil's B-tree is not convincing enough, [feel free](https://github.com/sslotin/amh-code/tree/main/b-tree) to add your favorite search tree to the benchmark.
+
+
+
+The performance of the B− tree matches what we originally predicted — at least for the lookups:
+
+![](../img/btree-absolute.svg)
+
+The relative speedup varies with the structure size — 7-18x/3-8x over STL and 3-7x/1.5-2x over Abseil:
+
+![](../img/btree-relative.svg)
+
+Insertions are only 1.5-2x faster than in `absl::btree`, which uses scalar code for everything. My best guess as to why insertions are *that* slow is a data dependency: since the tree nodes may change, the CPU can't start processing the next query before the previous one finishes (the [true latency](../s-tree/#comparison-with-stdlower_bound) of both query types is roughly equal and about 3x the reciprocal throughput of `lower_bound`).
+
+![](../img/btree-absl.svg)
+
+When the structure size is small, the [reciprocal throughput](../s-tree/#comparison-with-stdlower_bound) of `lower_bound` increases in discrete steps: it starts with 3.5ns when there is only the root to visit, then grows to 6.5ns (two nodes), and then to 12ns (three nodes), and then hits the L2 cache (not shown on the graphs) and starts increasing more smoothly, but still with noticeable spikes when the tree height increases.
+
+Interestingly, the B− tree outperforms `absl::btree` even when it stores just a single key: the latter spends around 5ns stalling on a [branch misprediction](/hpc/pipelining/branching/), while (the search in) the B− tree is entirely branchless.
+
+### Possible Optimizations
+
+In our previous endeavors in data structure optimization, it helped a lot to make as many variables as possible compile-time constants: the compiler can hardcode these constants into the machine code, simplify the arithmetic, unroll all the loops, and do many other nice things for us.
+
+This would not be a problem at all if our tree were of constant height, but it is not. It is *largely* constant, though: the height rarely changes, and in fact, under the constraints of the benchmark, the maximum height was only 6.
+
+What we can do is pre-compile the `insert` and `lower_bound` functions for several different compile-time constant heights and switch between them as the tree grows. The idiomatic C++ way is to use virtual functions, but I prefer to be explicit and use raw function pointers like this:
+
+```c++
+void (*insert_ptr)(int);
+int (*lower_bound_ptr)(int);
+
+void insert(int x) {
+    insert_ptr(x);
+}
+
+int lower_bound(int x) {
+    return lower_bound_ptr(x);
+}
+```
+
+We now define template functions that have the tree height as a parameter, and in the grow-tree block inside the `insert` function, we change the pointers as the tree grows:
+
+```c++
+template <int H>
+void insert_impl(int _x) {
+    // ...
+}
+
+template <int H>
+void insert_impl(int _x) {
+    // ...
+    if (/* tree grows */) {
+        // ...
+        insert_ptr = &insert_impl<H + 1>;
+        lower_bound_ptr = &lower_bound_impl<H + 1>;
+    }
+}
+
+template <>
+void insert_impl<10>(int x) {
+    std::cerr << "This depth was not supposed to be reached" << std::endl;
+    exit(1);
+}
+```
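+Somewhere during initialization, the pointers would then be set to the versions compiled for the starting height (a hypothetical snippet, assuming a `lower_bound_impl` template defined analogously to `insert_impl`):
+
+```c++
+void init() {
+    insert_ptr = &insert_impl<1>;
+    lower_bound_ptr = &lower_bound_impl<1>;
+}
+```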
+
+
+
+I tried this but could not get any performance improvement; still, I have high hopes for the approach because the compiler can (theoretically) eliminate `sk` and `si` entirely, removing any temporary storage and reading and computing everything just once, which would greatly optimize the `insert` procedure.
+
+Insertion can also probably be optimized by using a larger block size, as node splits would become rarer, but this comes at the cost of slower lookups. We could also try different node sizes for different layers: leaves should probably be larger than the internal nodes.
+
+**Another idea** is to move extra keys on insert to a sibling node, delaying the node split as long as possible.
+
+One such modification is known as the B* tree. It moves the last key to the next node if the current one is full, and when both nodes become full, it jointly splits both of them, producing three nodes that are ⅔ full. This reduces the memory overhead (nodes now range from ⅔ full to completely full, so they are ⅚ full on average) and increases the fanout factor, reducing the height, which helps all operations.
+
+This technique can even be extended to, say, three-to-four splits, although further generalization would come at the cost of a slower `insert`.
+
+**And yet another idea** is to get rid of (some) pointers. For example, for large trees, we can probably afford a small [S+ tree](../s-tree) for $16 \cdot 17$ or so elements as the root, which we rebuild from scratch on each infrequent occasion when it changes. You can't extend it to the whole tree, unfortunately: I believe there is a paper somewhere saying that we can't turn a dynamic structure fully implicit without also having to do $\Omega(\sqrt n)$ operations per query.
+
+We could also try some non-tree data structures, such as the [skip list](https://en.wikipedia.org/wiki/Skip_list). There has even been a [successful attempt to vectorize it](https://doublequan.github.io/) — although the speedup was not that impressive. I have little hope that the skip list, in particular, can be improved much, although it may achieve a higher total throughput in concurrent settings.
+
+### Other Operations
+
+To *delete* a key, we can similarly locate and remove it from a node with the same mask-store trick. After that, if the node is at least half-full, we're done. Otherwise, we try to borrow a key from the next sibling: if the sibling has more than $\frac{B}{2}$ keys, we append its first key and shift its keys one position to the left. Otherwise, both the current node and its sibling have no more than $\frac{B}{2}$ keys, so we can merge them, after which we go to the parent and iteratively delete a key there.
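+A sketch of the corresponding node-level primitive (my own illustration, assuming the same `Precalc` masks and node layout as in `insert`; note that the unaligned load reads one element past the node's key area, which would need extra care at the very end of the `tree` array):
+
+```c++
+// shift the keys at positions [i + 1, B - 1] one slot to the left and pad with an infinity
+void remove(int *node, int i) {
+    for (int j = 0; j < B; j += 8) {
+        reg t = _mm256_loadu_si256((reg*) &node[j + 1]);    // the keys, shifted one to the left
+        reg mask = _mm256_load_si256((reg*) &P.mask[i][j]); // same precomputed masks as in insert
+        _mm256_maskstore_epi32(&node[j], mask, t);
+    }
+    node[B - 1] = INT_MAX;
+}
+```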
+
+Another thing we may want to implement is *iteration*. Reading out all keys between `l` and `r` in order is a very common pattern — for example, in `SELECT abc ORDER BY xyz` type of queries in databases — and B+ trees usually store pointers to the next node in the data layer to allow for this type of rapid iteration. In B− trees, as we're using a much smaller node size, we can run into [pointer chasing](/hpc/cpu-cache/latency/) problems if we do this. Going to the parent and reading all its $B$ pointers is probably faster as it sidesteps this problem. Therefore, a stack of ancestors (the `sk` and `si` arrays we used in `insert`) can serve as an iterator and may even be better than separately storing next-node pointers in the leaves.
+
+We can easily implement almost everything that `std::set` does, but the B− tree, like any other B-tree, is very unlikely to become a drop-in replacement to `std::set` due to the requirement of pointer stability: a pointer to an element should remain valid unless the element is deleted, which is hard to achieve when we split and merge nodes all the time. This is a major problem not only for search trees but most data structures in general: having both pointer stability and high performance at the same time is next to impossible.
+
+
+
+## Acknowledgements
+
+Thanks to [Danila Kutenin](https://danlark.org/) from Google for meaningful discussions of applicability and the usage of B-trees in Abseil.
+
+
diff --git a/content/english/hpc/data-structures/binary-search.md b/content/english/hpc/data-structures/binary-search.md
index efc09c2f..6426ddde 100644
--- a/content/english/hpc/data-structures/binary-search.md
+++ b/content/english/hpc/data-structures/binary-search.md
@@ -1,565 +1,601 @@
 ---
 title: Binary Search
 weight: 1
+published: true
 ---
 
-The most fascinating showcases of performance engineering are not intricate 5-10% speed improvements of some databases, but multifold optimizations of some basic algorithms you can find in a textbook — the ones that are so simple that it would never even occur to try to optimize them. These kinds of optimizations are simple and instructive, and can very much be adopted elsewhere. Yet, with remarkable periodicity, these can be optimized to ridiculous levels of performance.
+
 
-In this article, we will focus on such an algorithm — binary search — and significantly improve its efficiency by rearranging elements of a sorted array in a more cache-friendly way. We will develop two versions, each achieving 4-7x speedup over the standard `std::lower_bound`, depending on the cache level and available memory bandwidth:
+While improving the speed of user-facing applications is the end goal of performance engineering, people don't really get excited over 5-10% improvements in some databases. Yes, this is what software engineers are paid for, but these types of optimizations tend to be too intricate and system-specific to be readily generalized to other software.
 
-- The first one uses what is known as *Eytzinger layout*, which is also a popular layout for other structures such as binary heaps. Our minimalistic implementation is only ~15 lines.
-- The second one is its generalization based on *B-tree layout*, which is more bulky. Although it uses SIMD, which technically disqualifies it from being binary search.
+Instead, the most fascinating showcases of performance engineering are multifold optimizations of textbook algorithms: the kinds that everybody knows and deemed so simple that it would never even occur to try to optimize them in the first place. These optimizations are simple and instructive and can very much be adopted elsewhere. And they are surprisingly not as rare as you'd think.
 
-A brief review of [CPU cache system](../cpu-cache) is strongly advised.
+
 
-## Why Binary Search is Slow
+In this section, we focus on one such fundamental algorithm — *binary search* — and implement two of its variants that are, depending on the problem size, up to 4x faster than `std::lower_bound`, while being under just 15 lines of code.
 
-Before jumping to optimized variants, let's briefly discuss the reasons why the textbook binary search is slow in the first place.
+The first algorithm achieves that by removing [branches](/hpc/pipelining/branching), and the second also optimizes the memory layout to achieve better [cache system](/hpc/cpu-cache) performance. This technically disqualifies it from being a drop-in replacement for `std::lower_bound` as it needs to permute the elements of the array before it can start answering queries — but I can't recall a lot of scenarios where you obtain a sorted array but can't afford to spend linear time on preprocessing.
 
-Here is the standard way of searching for the first element not less than $x$ in a sorted array of $n$ integers:
+
 
-If compiler is successful in piercing through the abstractions, it compiles to roughly the same machine code.
+The usual disclaimer: the CPU is a [Zen 2](https://www.7-cpu.com/cpu/Zen2.html), the RAM is a [DDR4-2666](/hpc/cpu-cache/), and the compiler we will be using by default is Clang 10. The performance on your machine may be different, so I highly encourage you to [go and test it](https://godbolt.org/z/14rd5Pnve) for yourself.
 
-### Spacial Locality
+
 
-![](../img/binary-heat.png)
+## Binary Search
+
+
+
+Here is the standard way of searching for the first element not less than `x` in a sorted array `t` of `n` integers that you can find in any introductory computer science textbook:
+
+```c++
+int lower_bound(int x) {
     int l = 0, r = n - 1;
     while (l < r) {
-        volatile int s = 0; // volatile to prevent compiler from cutting this code out
-        for (int i = 0; i < 10; i++)
-            s += i;
-        int t = (l + r) / 2;
-        if (a[t] >= x)
-            r = t;
+        int m = (l + r) / 2;
+        if (t[m] >= x)
+            r = m;
         else
-            l = t + 1;
+            l = m + 1;
     }
-    return a[l];
+    return t[l];
 }
 ```
 
-Contains an "if" that is impossible to predict better than a coin flip.
-
-It's not illegal: ternary operator is replaced with something like `CMOV` 
-
-```cpp
-int lower_bound(int x) {
-    int base = 0, len = n;
-    while (len > 1) {
-        int half = len / 2;
-        base = (a[base + half] >= x ? base : base + half);
-        len -= half;
+
+
+Find the middle element of the search range, compare it to `x`, shrink the range in half. Beautiful in its simplicity.
+
+A similar approach is employed by `std::lower_bound`, except that it needs to be more generic to support containers with non-random-access iterators and thus uses the first element and the size of the search interval instead of the two of its ends. To this end, implementations from both [Clang](https://github.com/llvm-mirror/libcxx/blob/78d6a7767ed57b50122a161b91f59f19c9bd0d19/include/algorithm#L4169) and [GCC](https://github.com/gcc-mirror/gcc/blob/d9375e490072d1aae73a93949aa158fcd2a27018/libstdc%2B%2B-v3/include/bits/stl_algobase.h#L1023) use this metaprogramming monstrosity:
+
+```c++
+template <class _Compare, class _ForwardIterator, class _Tp>
+_LIBCPP_CONSTEXPR_AFTER_CXX17 _ForwardIterator
+__lower_bound(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value_, _Compare __comp)
+{
+    typedef typename iterator_traits<_ForwardIterator>::difference_type difference_type;
+    difference_type __len = _VSTD::distance(__first, __last);
+    while (__len != 0)
+    {
+        difference_type __l2 = _VSTD::__half_positive(__len);
+        _ForwardIterator __m = __first;
+        _VSTD::advance(__m, __l2);
+        if (__comp(*__m, __value_))
+        {
+            __first = ++__m;
+            __len -= __l2 + 1;
+        }
+        else
+            __len = __l2;
     }
-    return a[base];
+    return __first;
 }
 ```
 
-But this is not the largest problem. The real problem is that it waits for its operands, and the results still can't be predicted.
+If the compiler is successful in removing the abstractions, it compiles to roughly the same machine code and yields roughly the same average latency, which [expectedly](/hpc/cpu-cache/latency) grows with the array size:
 
-The running time of this (or any) algorithm is not just the "cost" of all its arithmetic operations, but rather this cost *plus* the time spent waiting for data to be fetched from memory. Thus, depending on the algorithm and problem limitations, it can be CPU-bound or memory-bound, meaning that the running time is dominated by one of its components.
+![](../img/search-std.svg)
 
-Can be fetched ahead, but there is only 50% chance we will get it right on the first layer, then 25% chance on second and so on. We could do 2, 4, 8 and so on fetches, but these would grow exponentially.
+Since most people don't implement binary search by hand, we will use `std::lower_bound` from Clang as the baseline.
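+The plots in this section measure the average latency per query; a minimal driver to reproduce such measurements could look like this (a sketch of my own, not the exact benchmarking code used for the graphs):
+
+```c++
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <random>
+#include <vector>
+
+int main() {
+    const int n = 1 << 20, m = 1 << 22;
+    std::mt19937 rng(42);
+    std::vector<int> t(n), q(m);
+    for (int &x : t) x = rng() % (1u << 30);
+    for (int &x : q) x = rng() % (1u << 30);
+    std::sort(t.begin(), t.end());
+
+    int last = 0; // carry the previous result into the next query to measure latency, not throughput
+    auto start = std::chrono::steady_clock::now();
+    for (int i = 0; i < m; i++) {
+        auto it = std::lower_bound(t.begin(), t.end(), q[i] ^ (last & 1));
+        last = (it == t.end() ? 0 : *it);
+    }
+    auto end = std::chrono::steady_clock::now();
+    printf("%.2f ns per query (%d)\n",
+           std::chrono::duration<double, std::nano>(end - start).count() / m, last);
+}
+```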
 
-IMAGE HERE
+### The Bottleneck
 
-If array is large enough—usually around the point where it stops fitting in cache and fetches become significantly slower—the running time of binary search becomes dominated by memory fetches.
+Before jumping to the optimized implementations, let's briefly discuss why binary search is slow in the first place.
 
+If you run `std::lower_bound` with [perf](/hpc/profiling/events), you'll see that it spends most of its time on a [conditional jump](/hpc/architecture/loops) instruction:
 
-So, to sum up: ideally, we'd want some layout that is both blocks, and higher-order blocks to be placed in groups, and also to be capable.
+```nasm
+       │35:   mov    %rax,%rdx
+  0.52 │      sar    %rdx
+  0.33 │      lea    (%rsi,%rdx,4),%rcx
+  4.30 │      cmp    (%rcx),%edi
+ 65.39 │    ↓ jle    b0
+  0.07 │      sub    %rdx,%rax
+  9.32 │      lea    0x4(%rcx),%rsi
+  0.06 │      dec    %rax
+  1.37 │      test   %rax,%rax
+  1.11 │    ↑ jg     35
+```
 
-We can overcome this by enumerating and permuting array elements in a more cache-friendly way. The numeration we will use is actually half a millennium old, and chances are you already know it.
+This [pipeline stall](/hpc/) stops the search from progressing, and it is mainly caused by two [factors](/hpc/pipelining/hazards):
 
-## Eytzinger Layout
+- We suffer a *control hazard* because we have a [branch](/hpc/pipelining/branching) that is impossible to predict (queries and keys are drawn independently at random), and the processor has to halt for 10-15 cycles to flush the pipeline and fill it back on each branch mispredict.
+- We suffer a *data hazard* because we have to wait for the preceding comparison to complete, which in turn waits for one of its operands to be fetched from the memory — and it [may take](/hpc/cpu-cache/latency) anywhere between 0 and 300 cycles, depending on where it is located.
 
-**Michaël Eytzinger** is a 16th century Austrian nobleman known for his work on genealogy, particularily for a system for numbering ancestors called *ahnentafel* (German for "ancestor table").
+Now, let's try to get rid of these obstacles one by one.
 
-Ancestry mattered a lot back then, but writing down that data was expensive. *Ahnentafel* allows displaying a person's genealogy compactly, without wasting extra space by drawing diagrams.
+## Removing Branches
 
-It lists a person's direct ancestors in a fixed sequence of ascent. First, the person theirself is listed as number 1, and then, recursively, for each person numbered $k$, their father is listed as $2k$ and their mother as $(2k+1)$.
+We can replace branching with [predication](/hpc/pipelining/branchless). To make the task easier, we can adopt the STL approach and rewrite the loop using the first element and the size of the search interval (instead of its first and last element):
 
-Here is the example for Paul I, the great-grandson of Peter I, the Great:
+```c++
+int lower_bound(int x) {
+    int *base = t, len = n;
+    while (len > 1) {
+        int half = len / 2;
+        if (base[half - 1] < x) {
+            base += half;
+            len = len - half;
+        } else {
+            len = half;
+        }
+    }
+    return *base;
+}
+```
 
-1. Paul I
-2. Peter III (Paul's father)
-3. Catherine II (Paul's mother)
-4. Charles Frederick (Peter's father, Paul's paternal grandfather)
-5. Anna Petrovna (Peter's mother, Paul's paternal grandmother)
-6. Christian August (Catherine's father, Paul's maternal grandfather)
-7. Johanna Elisabeth (Catherine's mother, Paul's maternal grandmother)
+Note that, on each iteration, `len` is essentially just halved and then either floored or ceiled, depending on how the comparison went. This conditional update seems unnecessary; to avoid it, we can simply say that it's always ceiled:
 
-Apart from being compact, it has some nice properties, like that all even-numbered persons are male and all odd-numbered (possibly apart from 1) are female.
+```c++
+int lower_bound(int x) {
+    int *base = t, len = n;
+    while (len > 1) {
+        int half = len / 2;
+        if (base[half - 1] < x)
+            base += half;
+        len -= half; // = ceil(len / 2)
+    }
+    return *base;
+}
+```
 
-One can also find the number of a particular ancestor only knowing the genders of their descendants. For example, Peter the Great's bloodline is Paul I → Peter III → Anna Petrovna → Peter the Great, so his number should be $((1 \times 2) \times 2 + 1) \times 2 = 10$.
+This way, we only need to update the first element of the search interval with a [conditional move](/hpc/pipelining/branchless/) and halve its size on each iteration:
 
-**In computer science**, this enumeration has been widely used for implicit (i. e. pointer-free) implementation of heaps, segment trees, and other binary tree structures, where instead of names it stores underlying array items.
+```c++
+int lower_bound(int x) {
+    int *base = t, len = n;
+    while (len > 1) {
+        int half = len / 2;
+        base += (base[half - 1] < x) * half; // will be replaced with a "cmov"
+        len -= half;
+    }
+    return *base;
+}
+```
 
-This is how this layout will look when applied to binary search:
+
 
-![](../img/eytzinger.png)
+Note that this loop is not always equivalent to the standard binary search. Since it always rounds *up* the size of the search interval, it accesses slightly different elements and may perform one comparison more than needed. Apart from simplifying computations on each iteration, it also makes the number of iterations constant if the array size is constant, removing branch mispredictions completely.
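+As a side note (my own illustration, not something benchmarked in this article), if the array size really is a compile-time constant, you can make that explicit and let the compiler unroll the whole loop:
+
+```c++
+template <int N>
+int lower_bound(const int *t, int x) {
+    const int *base = t;
+    int len = N;                // the trip count depends only on N,
+    while (len > 1) {           // so the compiler can unroll it completely
+        int half = len / 2;
+        base += (base[half - 1] < x) * half;
+        len -= half;
+    }
+    return *base;
+}
+```
+
+With a constant `N`, the number of iterations is fixed at compile time, so there is no loop branch left to mispredict.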
 
-You can immediately see how its temporal locality is better (in fact, theoretically optimal) as the elements closer to the root are closer to the beginning of the array, and thus are more likely to be fetched from cache.
+As typical for predication, this trick is very fragile to compiler optimizations — depending on the compiler and how the function is invoked, it may still leave a branch or generate suboptimal code. It works fine on Clang 10, yielding a 2.5-3x improvement on small arrays:
 
-![](../img/eytzinger-search.png)
-![](../img/eytzinger-heat.png)
+
 
-### Construction
+![](../img/search-branchless.svg)
 
-Here is a function that constructs Eytzinger array by traversing the original search tree. 
+One interesting detail is that it performs worse on large arrays. It seems weird: the total delay is dominated by the RAM latency, and since it does roughly the same memory accesses as the standard binary search, it should be roughly the same or even slightly better.
 
-It takes two indexes $i$ and $k$—one in the original array and one in constructed—and recursively goes to two branches until a leaf node is reached, which could simply be checked by asserting $k \leq n$ as Eytzinger array should have same number of items.
+The real question you need to ask is not why the branchless implementation is worse but why the branchy version is better. It happens because when you have branching, the CPU can [speculate](/hpc/pipelining/branching/) on one of the branches and start fetching either the left or the right key before it can even confirm that it is the right one — which effectively acts as implicit [prefetching](/hpc/cpu-cache/prefetching).
 
-```cpp
-const int n = 1e5;
-int a[n], b[n+1];
+For the branchless implementation, this doesn't happen, as `cmov` is treated like any other instruction, and the branch predictor doesn't try to peek into its operands to predict the future. To compensate for this, we can prefetch the data in software by explicitly requesting the left and right child key:
 
-int eytzinger(int i = 0, int k = 1) {
-    if (k <= n) {
-        i = eytzinger(i, 2 * k);
-        b[k] = a[i++];
-        i = eytzinger(i, 2 * k + 1);
+```c++
+int lower_bound(int x) {
+    int *base = t, len = n;
+    while (len > 1) {
+        int half = len / 2;
+        len -= half;
+        __builtin_prefetch(&base[len / 2 - 1]);
+        __builtin_prefetch(&base[half + len / 2 - 1]);
+        base += (base[half - 1] < x) * half;
     }
-    return i;
+    return *base;
 }
 ```
 
-Despite being recursive, this is actually a really fast implementation as all memory reads are sequential.
+
 
-Note that the first element is left unfilled and the whole array is essencially 1-shifted. This will actually turn out to be a huge performance booster.
+With prefetching, the performance on large arrays becomes roughly the same:
 
-## Binary search implementation
+![](../img/search-branchless-prefetch.svg)
 
-We can now descend this array using only indices: we just start with $k=1$ and execute $k := 2k$ if we need to go left and $k := 2k + 1$ if we need to go right. We don't even need to store and recalculate binary search boundaries anymore.
+The branchless version still scales slightly worse because the branchy one effectively prefetches not only the children but also "grandchildren," "great-grandchildren," and so on — although the usefulness of each new speculative read diminishes exponentially as the prediction is less and less likely to be correct.
 
-The only problem arises when we need to restore the index of the resulting element, as $k$ may end up not pointing to a leaf node. Here is an example of how that can happen:
+In the branchless version, we could also fetch ahead by more than one layer, but the number of fetches we'd need also grows exponentially. Instead, we will try a different approach to optimize memory operations.
 
-```python
-    array:  1 2 3 4 5 6 7 8
-eytzinger:  4 2 5 1 6 3 7 8
-1st range:  ---------------  k := 1
-2nd range:  -------          k := 2*k      (=2)
-3rd range:      ---          k := 2*k + 1  (=5)
-4th range:        -          k := 2*k + 1  (=11)
-```
+## Optimizing the Layout
 
-Here we query array of $[1, …, 8]$ for the lower bound of $x=4$. We compare it against $4$, $2$ and $5$, and go left-right-right and end up with $k = 11$, which isn't even a valid array index.
+The memory requests we perform during binary search form a very specific access pattern:
+
+![](../img/binary-search.png)
 
-Note that, unless the answer is the last element of the array, we compare $x$ against it at some point, and after we learn that it is not less than $x$, we start comparing $x$ against elements to the left, and all these comparisons will evaluate true (i. e. leading to the right). Hence, the solution to restoring the resulting element is to cancel some number of right turns.
+How likely is it that the elements on each request are cached? How good is their [data locality](/hpc/external-memory/locality/)?
 
-This can be done in an elegant way by observing that the right turns are recorded in the binary notation of $k$ as 1-bits, and so we just need to find the number of trailing ones in the binary notation and right-shift $k$ by exactly that amount.
+- *Spatial locality* seems to be okay for the last 3 to 4 requests that are likely to be on the same [cache line](/hpc/cpu-cache/cache-lines) — but all the previous requests require huge memory jumps.
+- *Temporal locality* seems to be okay for the first dozen or so requests — there aren't that many different comparison sequences of this length, so we will be comparing against the same middle elements over and over, which are likely to be cached (see the rough estimate below).
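+To put a rough number on "the first dozen or so" (a back-of-the-envelope estimate of my own, not a claim made in the article): each of these hot middle elements sits in its own cache line, so keeping the top $k$ levels of the comparison tree warm costs about
+
+$$
+(2^k - 1) \cdot 64 \approx 2^{k + 6} \;\; \text{bytes}
+$$
+
+of cache, which for the 512 KB of L2 on this CPU gives $k \lesssim 13$ levels.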
 
-To do this we can invert the number (`~x`) and call "find first set" instruction available on most systems. In GCC, the corresponding builtin is `__builtin_ffs`.
+To illustrate how important the second type of cache sharing is, let's try to pick the element we will compare to on each iteration randomly among the elements of the search interval, instead of the middle one:
 
-```cpp
-int search(int x) {
-    int k = 1;
-    while (k <= n) {
-        if (b[k] >= x)
-            k = 2 * k;
-        else
-            k = 2 * k + 1;
-    }
-    k >>= __builtin_ffs(~k);
-    return b[k];
+```c++
+int lower_bound(int x) {
+    int l = 0, r = n - 1;
+    while (l < r) {
+        int m = l + rand() % (r - l);
+        if (t[m] >= x)
+            r = m;
+        else
+            l = m + 1;
+    }
+    return t[l];
 }
 ```
 
-Note that $k$ will be zero if binary search returned no result (i. e. all elements are less than $x$ and all turns were right-turns that got canceled). In that case, you can put a special flag in the first element of `b`.
+[Theoretically](#appendix-random-binary-search), this randomized binary search is expected to do 30-40% more comparisons than the normal one, but on a real computer, the running time grows ~6x on large arrays:
 
-This is already 2-3 times faster than `std::lower_bound`, but we are not going to stop there and apply a series of small incremental improvements.
+![](../img/search-random.svg)
 
-### Branch-free
+This isn't just caused by the `rand()` call being slow. You can clearly see the point on the L2-L3 boundary where the memory latency starts to outweigh the cost of the random number generation and the [modulo](/hpc/arithmetic/division). The performance degrades because now almost none of the fetched elements are likely to be cached — not just the last few, as in the regular binary search.
 
-Compiled program instructions are stored and loaded from main memory too, just as normal data. They are fetched during execution by similar mechanisms, and they have a separate instruction cache. In fact, in large applications you can sometimes remove blocks of literally unused code, and the program may run faster because of better instruction cache hit rate, but this is a topic for another article.
+Another potential negative effect is that of [cache associativity](/hpc/cpu-cache/associativity). If the array size is a multiple of a large power of two, then the indices of these "hot" elements will also be divisible by some large powers of two and map to the same cache line, kicking each other out. For example, binary searching over arrays of size $2^{20}$ takes about ~360ns per query while searching over arrays of size $(2^{20} + 123)$ takes ~300ns — a 20% difference. There are [ways](https://en.wikipedia.org/wiki/Fibonacci_search_technique) to fix this problem, but to not get distracted from more pressing matters, we are just going to ignore it: all array sizes we use are in the form of $\lfloor 1.17^k \rfloor$ for integer $k$ so that any cache side effects are unlikely.
 
-To avoid performance hits caused by memory latency here, CPU loads 20-ish instructions ahead of time, but to do this it needs to know ahead of time which instructions to fetch. If a program has conditional execution (if-s, while-s, for-s) there is no option other than to take a guess.
+The real problem with our memory layout is that it doesn't make the most efficient use of temporal locality because it groups hot and cold elements together. For example, we likely store the element $\lfloor n/2 \rfloor$, which we request first on every query, in the same cache line as $\lfloor n/2 \rfloor + 1$, which we almost never request.
 
-Branch misprediction (guessing "wrong" branch of "if") costs around 10-20 cycles. To partially negate this penalty, hardware [branch predictors](https://en.wikipedia.org/wiki/Branch_predictor) were developed. These are complex ad-hoc systems that use statistical methods—some even use simple [neural networks](https://en.wikipedia.org/wiki/Branch_predictor#Neural_branch_prediction)—to make a more accurate guess.
+
 
-In case of binary search, if all of our data is random, branch prediction doesn't help at all, just because it can't: all comparisons are 50-50. This is why we need to get rid of if-s and rewrite our main loop the following way:
+Here is the heatmap visualizing the expected frequency of comparisons for a 31-element array:
 
-```cpp
-while (k <= n)
-    k = 2 * k + (b[k] < x);
-```
+![](../img/binary-heat.png)
 
-It also directly saves us from executing a few unnecessary arithmetic instructions.
+So, ideally, we'd want a memory layout where hot elements are grouped with hot elements, and cold elements are grouped with cold elements. And we can achieve this if we permute the array in a more cache-friendly way by renumbering them. The numeration we will use is actually half a millennium old, and chances are, you already know it.
 
-### Prefetching
+### Eytzinger Layout
 
-Compiler doesn't like when CPU is sitting idle while waiting for memory fetches. Sometimes it can take a guess about which cache line is going to be needed soon and fetch it ahead of time (recall that bandwidth-latency product is usually much larger than 1).
+**Michaël Eytzinger** is a 16th-century Austrian nobleman known for his work on genealogy, particularly for a system for numbering ancestors called *ahnentafel* (German for "ancestor table").
 
-This works well for simple access patterns, like iterating over array in increasing or decreasing order, but for something complex like what we have here it's not going to perform well.
+Ancestry mattered a lot back then, but writing down that data was expensive. *Ahnentafel* allows displaying a person's genealogy compactly, without wasting extra space by drawing diagrams.
 
-As we know a bit more about our problem than the compiler does, we can explicitly tell it to prefetch a cache line we need. This is done by `__builtin_prefetch` in GCC:
+It lists a person's direct ancestors in a fixed sequence of ascent. First, the person themselves is listed as number 1, and then, recursively, for each person numbered $k$, their father is listed as $2k$ and their mother as $(2k+1)$.
 
-```cpp
-while (k <= n) {
-    __builtin_prefetch(b + k * block_size);
-    k = 2 * k + (b[k] < x);
-}
-```
+Here is the example for [Paul I](https://en.wikipedia.org/wiki/Paul_I_of_Russia), the great-grandson of [Peter the Great](https://en.wikipedia.org/wiki/Peter_the_Great):
 
-Here, `block_size` equals 16, which is precisely how many ints are needed to cover a cache line. When we reference cache line at `b + k * block_size`, we are referencing $k$'s grand-grandson (`block_size` = $2 \times 2 \times 2 \times 2$, or 4 left turns) and possibly some of his neighbours in his layer (recall that indexes at the same level are just consecutive numbers).
+1. Paul I
+2. Peter III (Paul's father)
+3. [Catherine II](https://en.wikipedia.org/wiki/Catherine_the_Great) (Paul's mother)
+4. Charles Frederick (Peter's father, Paul's paternal grandfather)
+5. Anna Petrovna (Peter's mother, Paul's paternal grandmother)
+6. Christian August (Catherine's father, Paul's maternal grandfather)
+7. Johanna Elisabeth (Catherine's mother, Paul's maternal grandmother)
 
-The whole point of doing this is that there is a good chance that we will prefetch an element that we will use later on $(i+4)$-th iteration. What chance, exactly? Well, it turns out that it is constant for each iteration.
+Apart from being compact, it has some nice properties, like that all even-numbered persons are male and all odd-numbered (possibly except for 1) are female. One can also find the number of a particular ancestor only knowing the genders of their descendants. For example, Peter the Great's bloodline is Paul I → Peter III → Anna Petrovna → Peter the Great, so his number should be $((1 \times 2) \times 2 + 1) \times 2 = 10$.
 
-### Memory allignment
+**In computer science**, this enumeration has been widely used for implicit (pointer-free) implementations of heaps, segment trees, and other binary tree structures — where instead of names, it stores underlying array items.
 
-Note that for each layer in the tree, except for the first 4 and possibly the last one, the number of nodes in that layer is divisible by 16, the block size. This means that the fraction of covered nodes on *each* iteration depends only on the position of the first offset of the array in respect to its cache line. But what is more important is that it can be made that all of $k$'s grand-grandchildren are covered by the same cache line.
+Here is how this layout looks when applied to binary search:
 
-The way to achieve this is to place the first element of the array to the 1st position (0-indexed) of a cache line, or placing the array itself on the beginning of a cache line, since its first (i. e. `b[0]`) element is blank by design. This way the next $1 + 2 + 4 + 8 = 15$ elements of first 4 layers will occupy the rest of the cache line, and the rest of the array is alligned in nice 16-element blocks of nodes that share a grandpa.
+![Note that the tree is slightly imbalanced (because the last layer is continuous)](../img/eytzinger.png)
 
-We just need to ask memory manager to allocate our array on the beginning of a cache line (by default it allocates your arrays wherever it wants), and that's it. To do this, we can use `alignas` specifier:
+When searching in this layout, we just need to start from the first element of the array, and then on each iteration jump to either $2 k$ or $(2k + 1)$, depending on how the comparison went:
 
-```cpp
-alignas(64) int b[n+1];
-```
+![](../img/eytzinger-search.png)
 
-This is it. Now our algorithm is constantly prefetching 4 layers / cache lines ahead of time, which is covered by the bandwidth of our RAM. This way the effective latency is reduced by a factor of 4, and we're basically trading off bandwidth for latency.
+You can immediately see how its temporal locality is better (and, in fact, theoretically optimal) as the elements closer to the root are closer to the beginning of the array and thus are more likely to be fetched from the cache.
 
-### Complete implementation
+![](../img/eytzinger-heat.png)
 
-```cpp
-#pragma GCC optimize("O3")
-#include 
+Another way to look at it is that we write every even-indexed element to the end of the new array, then write every even-indexed element of the remaining ones right before them, and so on, until we place the root as the first element. 
 
-using namespace std;
+### Construction
 
-const int n = (1<<20);
-const int block_size = 16; // = 64 / 4 = cache_line_size / sizeof(int)
-alignas(64) int a[n], b[n+1];
+To construct the Eytzinger array, we could do this even-odd [filtering](/hpc/simd/shuffling/#permutations-and-lookup-tables) $O(\log n)$ times — and, perhaps, this is the fastest approach — but for brevity, we will instead build it by traversing the original search tree:
 
-int eytzinger(int i = 0, int k = 1) {
-    if (k <= n) {
-        i = eytzinger(i, 2 * k);
-        b[k] = a[i++];
-        i = eytzinger(i, 2 * k + 1);
-    }
-    return i;
-}
+```c++
+int a[n], t[n + 1]; // the original sorted array and the eytzinger array we build
+//              ^ we need one element more because of one-based indexing
 
-int search(int x) {
-    int k = 1;
-    while (k <= n) {
-        __builtin_prefetch(b + k * block_size);
-        k = 2 * k + (b[k] < x);
+void eytzinger(int k = 1) {
+    static int i = 0; // <- careful running it on multiple arrays
+    if (k <= n) {
+        eytzinger(2 * k);
+        t[k] = a[i++];
+        eytzinger(2 * k + 1);
     }
-    k >>= __builtin_ffs(~k);
-    return k;
 }
 ```
 
-Few more things to note:
+This function takes the current node number `k`, recursively writes out all elements to the left of the middle of the search interval, writes out the current element we'd compare against, and then recursively writes out all the elements on the right. It seems a bit complicated, but to convince yourself that it works, you only need three observations:
 
-* It works best when $n$ is a power of 2 or close to it, because otherwise the branch predictor will have a hard time figuring out whether or not to unroll the $(\log n)$-th cycle.
+- It writes exactly `n` elements as we enter the body of `if` for each `k` from `1` to `n` just once.
+- It writes out sequential elements from the original array as it increments the `i` pointer each time.
+- By the time we write the element at node `k`, we will have already written all the elements to its left (exactly `i`).
 
-* Its performance varies by cache size and array length, but stays >3x even on smaller arrays (<1MB).
+Despite being recursive, it is actually quite fast as all the memory reads are sequential, and the memory writes are only in $O(\log n)$ different memory blocks at a time. The permutation is both logically and computationally harder to maintain, though: adding an element to a sorted array only requires shifting a suffix of its elements one position to the right, while the Eytzinger array practically needs to be rebuilt from scratch.
 
-* Preprocessing isn't costly. It is around 1% of the cost of firing the same number of queries as the array size.
+Note that this traversal and the resulting permutation are not exactly equivalent to the "tree" of vanilla binary search: for example, the left child subtree may be larger than the right child subtree — up to twice as large — but it doesn't matter much since both approaches result in the same $\lceil \log_2 n \rceil$ tree depth.
 
-* Modern hardware won't penalize you for prefetching cache lines that aren't yours, though this maybe be an issue for older CPUs, which can be solved by a simple `if` statement.
+Also note that the Eytzinger array is one-indexed — this will be important for performance later. You can put in the zeroth element the value that you want to be returned in the case when the lower bound doesn't exist (similar to `a.end()` for `std::lower_bound`).
 
-* For some reason, basic binary search implementation (the very first code block in this article) is already ~20% faster than `std::sort`.
+### Search Implementation
 
-## B-tree Layout
+We can now descend this array using only indices: we just start with $k=1$ and execute $k := 2k$ if we need to go left and $k := 2k + 1$ if we need to go right. We don't even need to store and recalculate the search boundaries anymore. This simplicity also lets us avoid branching:
 
-B-trees are basically $(k+1)$-ary trees, meaning that they store $k$ elements in each node and choose between $(k+1)$ possible branches instead of 2.
-
-They are widely used for indexing in databases, especially those that operate on-disk, because if $k$ is big, this allows large sequential memory accesses while reducing the height of the tree.
+```c++
+int k = 1;
+while (k <= n)
+    k = 2 * k + (t[k] < x);
+```
 
-To perform static binary searches, one can implement a B-tree in an implicit way, i. e. without actually storing any pointers and spending only $O(1)$ additional memory, and $k$ could be made equal to the cache line size so that each node request fetches exactly one cache line.
+The only problem arises when we need to restore the index of the resulting element, as $k$ does not directly point to it. Consider this example (its corresponding tree is listed above):
 
-![](../img/btree.png)
+
 
-Turns out, they have the same rate of growth but sligtly larger compute-tied constant. While the latter is explainable (our while loop only has like 5 instructions; can't outpace that), the former is surprising.
+
+    array:  0 1 2 3 4 5 6 7 8 9                            
+eytzinger:  6 3 7 1 5 8 9 0 2 4                            
+1st range:  ------------?------  k := 2*k     = 2   (6 ≥ 3)
+2nd range:  ------?------        k := 2*k     = 4   (3 ≥ 3)
+3rd range:  --?----              k := 2*k + 1 = 9   (1 < 3)
+4th range:      ?--              k := 2*k + 1 = 19  (2 < 3)
+5th range:        !                                        
+
-Let's assume that arithmetic costs nothing and do simple cache block analysis: + -* The Eytzinger binary search is supposed to be $4$ times faster if compute didn't matter, as it requests them ~4 times faster on average. +Here we query the array of $[0, …, 9]$ for the lower bound of $x=3$. We compare it against $6$, $3$, $1$, and $2$, go left-left-right-right, and end up with $k = 19$, which isn't even a valid array index. -* The B-tree makes $\frac{\log_{17} n}{\log_2 n} = \frac{\log n}{\log 17} \frac{\log 2}{\log n} = \frac{\log 2}{\log 17} \approx 0.245$ memory access per each request of binary search, i. e. it requests ~4 times less cache lines to fetch +The trick is to notice that, unless the answer is the last element of the array, we compare $x$ against it at some point, and after we've learned that it is not less than $x$, we go left exactly once and then keep going right until we reach a leaf (because we will only be comparing $x$ against lesser elements). Therefore, to restore the answer, we just need to "cancel" some number of right turns and then one more. -This explains why they have roughly the same slope. +This can be done in an elegant way by observing that the right turns are recorded in the binary representation of $k$ as 1-bits, and so we just need to find the number of trailing 1s in the binary representation and right-shift $k$ by exactly that number of bits plus one. To do this, we can invert the number (`~k`) and call the "find first set" instruction: -Note that this method, while being great for single-threaded world, is unlikely to make its way into database and heavy multi-threaded applications, because it sacrifices bandwidth to achieve low latency. +```c++ +int lower_bound(int x) { + int k = 1; + while (k <= n) + k = 2 * k + (t[k] < x); + k >>= __builtin_ffs(~k); + return t[k]; +} +``` -[Part 2](https://algorithmica.org/en/b-tree) explores efficient implementation of implicit static B-trees in bandwidth-constrained environment. +We run it, and… well, it doesn't look *that* good: +![](../img/search-eytzinger.svg) -## Implicit Static B-trees +The latency on smaller arrays is on par with the branchless binary search implementation — which isn't surprising as it is just two lines of code — but it starts taking off much sooner. The reason is that the Eytzinger binary search doesn't get the advantage of spatial locality: the last 3-4 elements we compare against are not in the same cache line anymore, and we have to fetch them separately. -This is a follow up on a [previous article](https://algorithmica.org/en/eytzinger) about using Eytzinger memory layout to speed up binary search. Here we use implicit (pointerless) B-trees accelerated with SIMD operations to perform search efficiently while using less memory bandwidth. +If you think about it deeper, you might object that the improved temporal locality should compensate for that. Before, we were using only about $\frac{1}{16}$-th of the cache line to store one hot element, and now we are using all of it, so the effective cache size is larger by a factor of 16, which lets us cover $\log_2 16 = 4$ more first requests. -It performs slightly worse on array sizes that fit lower layers of cache, but in low-bandwidth environments it can be up to 3x faster (or 7x faster than `std::lower_bound`). +But if you think about it more, you understand that this isn’t enough compensation. Caching the other 15 elements wasn’t completely useless, and also, the hardware prefetcher could fetch the neighboring cache lines of our requests. 
If this was one of our last requests, the rest of what we will be reading will probably be cached elements. So actually, the last 6-7 accesses are likely to be cached, not 3-4. -## B-tree layout +It seems like we did an overall stupid thing switching to this layout, but there is a way to make it worthwhile. -B-trees generalize the concept of binary search trees by allowing nodes to have more than two children. +### Prefetching -Instead of single key, a B-tree node contains up to $B$ sorted keys may have up to $(B+1)$ children, thus reducing the tree height in $\frac{\log_2 n}{\log_B n} = \frac{\log B}{\log 2} = \log_2 B$ times. +To hide the memory latency, we can use software prefetching similar to how we did for branchless binary search. But instead of issuing two separate prefetch instructions for the left and right child nodes, we can notice that they are neighbors in the Eytzinger array: one has index $2 k$ and the other $(2k + 1)$, so they are likely in the same cache line, and we can use just one instruction. -They were primarily developed for the purpose of managing on-disk databases, as their random access times are almost the same as reading 1MB of data sequentially, which makes the trade-off between number of comparisons and tree height beneficial. In our implementation, we will make each the size of each block equal to the cache line size, which in case of `int` is 16 elements. +This observation extends to the grand-children of node $k$ — they are also stored sequentially: -Normally, a B-tree node also stores $(B+1)$ pointers to its children, but we will only store keys and rely on pointer arithmetic, similar to the one used in Eytzinger array: +``` +2 * 2 * k = 4 * k +2 * 2 * k + 1 = 4 * k + 1 +2 * (2 * k + 1) = 4 * k + 2 +2 * (2 * k + 1) + 1 = 4 * k + 3 +``` -* The root node is numbered $0$. + -* Node $k$ has $(B+1)$ child nodes numbered $\{k \cdot (B+1) + i\}$ for $i \in [1, B]$. +Their cache line can also be fetched with one instruction. Interesting… what if we continue this, and instead of fetching direct children, we fetch ahead as many descendants as we can cramp into one cache line? That would be $\frac{64}{4} = 16$ elements, our great-great-grandchildren with indices from $16k$ to $(16k + 15)$. -Keys are stored in a 2d array in non-decreasing order. If the length of the initial array is not a multiple of $B$, the last block is padded with the largest value if its data type. +Now, if we prefetch just one of these 16 elements, we will probably only get some but not all of them, as they may cross a cache line boundary. We can prefetch the first *and* the last element, but to get away with just one memory request, we need to notice that the index of the first element, $16k$, is divisible by $16$, so its memory address will be the base address of the array plus something divisible by $16 \cdot 4 = 64$, the cache line size. If the array were to begin on a cache line, then these $16$ great-great-grandchildren elements will be guaranteed to be on a single cache line, which is just what we needed. -```cpp -const int nblocks = (n + B - 1) / B; -alignas(64) int btree[nblocks][B]; +Therefore, we only need to [align](/hpc/cpu-cache/alignment) the array: -int go(int k, int i) { - return k * (B + 1) + i + 1; -} +```c++ +t = (int*) std::aligned_alloc(64, 4 * (n + 1)); ``` -In the code, we use zero-indexation for child nodes. - -## Construction - -We can construct B-tree similarly by traversing the search tree. 
+And then prefetch the element indexed $16 k$ on each iteration: -```cpp -void build(int k = 0) { - static int t = 0; - if (k < nblocks) { - for (int i = 0; i < B; i++) { - build(go(k, i)); - btree[k][i] = (t < n ? a[t++] : INF); - } - build(go(k, B)); +```c++ +int lower_bound(int x) { + int k = 1; + while (k <= n) { + __builtin_prefetch(t + k * 16); + k = 2 * k + (t[k] < x); } + k >>= __builtin_ffs(~k); + return t[k]; } ``` -It is correct, because each value of initial array will be copied to a unique position in the resulting array, and the tree height is $\Theta(\log_{B+1} n)$, because $k$ is multiplied by $(B + 1)$ each time a child node is created. +The performance on large arrays improves 3-4x from the previous version and ~2x compared to `std::lower_bound`. Not bad for just two more lines of code: -Note that this approach causes a slight imbalance: "lefter" children may have larger respective ranges. +![](../img/search-eytzinger-prefetch.svg) -## Basic Search +Essentially, what we do here is hide the latency by prefetching four steps ahead and overlapping memory requests. Theoretically, if the compute didn't matter, we would expect a ~4x speedup, but in reality, we get a somewhat more moderate speedup. -Here is a short but rather inefficient implementation that we will improve later: +We can also try to prefetch further than that four steps ahead, and we don't even have to use more than one prefetch instruction for that: we can try to request only the first cache line and rely on the hardware to prefetch its neighbors. This trick may or may not improve actual performance — depends on the hardware: -```cpp -int search(int x) { - int k = 0, res = INF; - start: // the only justified usage of the goto statement - // as doing otherwise would add extra inefficiency and more code - while (k < nblocks) { - for (int i = 0; i < B; i++) { - if (btree[k][i] >= x) { - res = btree[k][i]; - k = go(k, i); - goto start; - } - } - k = go(k, B); - } - return res; -} +```c++ +__builtin_prefetch(t + k * 32); ``` -The issue here is that it runs a linear search on the whole array, and also that it has lots of conditionals that costs much more than just comparing integers. +Also, note that the last few prefetch requests are actually not needed, and in fact, they may even be outside the memory region allocated for the program. On most modern CPUs, invalid prefetch instructions get converted into no-ops, so it isn't a problem, but on some platforms, this may cause a slowdown, so it may make sense, for example, to split off the last ~4 iterations from the loop to try to remove them. -Here are some ideas to counter this: +This prefetching technique allows us to read up to four elements ahead, but it doesn't really come for free — we are effectively trading off excess memory [bandwidth](/hpc/cpu-cache/bandwidth) for reduced [latency](/hpc/cpu-cache/latency). If you run more than one instance at a time on separate hardware threads or just any other memory-intensive computation in the background, it will significantly [affect](/hpc/cpu-cache/sharing) the benchmark performance. -* We could unroll the loop so that it performs $B$ comparisons unconditionally and computes index of the right child node. +But we can do better. Instead of fetching four cache lines at a time, we could fetch four times *fewer* cache lines. And in the [next section](../s-tree), we will explore the approach. -* We could run a tiny binary search to get the right index, but there is considerable overhead to this. 
+ -Back in the 90s, computer engineers discovered that you can get more bang for a buck by adding circuits that do more useful work per cycle than just trying to increase CPU clock rate which [can't continue forever](https://en.wikipedia.org/wiki/Speed_of_light). +### Removing the Last Branch -This worked [particularly well](https://finance.yahoo.com/quote/NVDA/) for parallelizable workloads like video game graphics where just you need to perform the same operation over some array of data. This this is how the concept of *SIMD* became a thing, which stands for *single instruction, multiple data*. +Just one finishing touch: did you notice the bumpiness of the Eytzinger search? This isn't random noise — let's zoom in: -Modern hardware can do [lots of stuff](https://software.intel.com/sites/landingpage/IntrinsicsGuide) under this paradigm, leveraging *data-level parallelism*. For example, the simplest thing you can do on modern Intel CPUs is to: +![](../img/search-eytzinger-small.svg) -1. load 256-bit block of ints (which is $\frac{256}{32} = 8$ ints), +The latency is ~10ns higher for the array sizes in the form of $1.5 \cdot 2^k$. These are mispredicted branches from the loop itself — the last branch, to be exact. When the array size is far from a power of two, it is hard to predict whether the loop will make $\lfloor \log_2 n \rfloor$ or $\lfloor \log_2 n \rfloor + 1$ iterations, so we have a 50% chance to suffer exactly one branch mispredict. -2. load another 256-bit block of ints, +One way to address it is to pad the array with infinities to the closest power of two, but this wastes memory. Instead, we get rid of that last branch by always executing a constant minimum number of iterations and then using predication to optionally make the last comparison against some dummy element — that is guaranteed to be less than $x$ so that its comparison will be canceled: -3. add them together, +```c++ +t[0] = -1; // an element that is less than x +iters = std::__lg(n + 1); -4. write the result somewhere else +int lower_bound(int x) { + int k = 1; -…and this whole transaction costs the same as loading and adding just two ints—which means we can do 8 times more work. Magic! + for (int i = 0; i < iters; i++) + k = 2 * k + (t[k] < x); -So, as we promised before, we will perform all $16$ comparisons to compute the index of the right child node, but we leverage SIMD instructions to do it efficiently. Just to clarify—we want to do something like this: + int *loc = (k <= n ? t + k : t); + k = 2 * k + (*loc < x); + + k >>= __builtin_ffs(~k); -```cpp -int mask = (1 << B); -for (int i = 0; i < B; i++) - mask |= (btree[k][i] >= x) << i; -int i = __builtin_ffs(mask) - 1; -// now i is the number of the correct child node + return t[k]; +} ``` -…but ~8 times faster. +The graph is now smooth, and on small arrays, it is just a couple of cycles slower than the branchless binary search: -Actually, compiler quite often produces very optimized code that leverages these instructions for certain types of loops. This is called auto-vectorization, and this is the reason why a loop that sums up an array of `short`s is faster (theoretically by a factor of two) than the same loop for `int`s: you can fit more elements on the same 256-bit block. Sadly, this is not our case, as we have loop-carried dependencies. +![](../img/search-eytzinger-branchless.svg) -The algorithm we will implement: +Interestingly, now GCC fails to replace the branch with `cmov`, but Clang doesn't. 1-1. -1. 
Somewhere before the main loop, convert $x$ to a vector of $8$ copies of $x$. +### Appendix: Random Binary Search -2. Load the keys stored in node into another 256-bit vector. +By the way, finding the exact expected number of comparisons for random binary search is quite an interesting math problem in and of itself. Try solving it yourself first! -3. Compare these two vectors. This returns a 256-bit mask in which pairs that compared "greater than" are marked with ones. +The way to compute it *algorithmically* is through dynamic programming. If we denote $f_n$ as the expected number of comparisons to find a random lower bound on a search interval of size $n$, it can be calculated from the previous values $f_1, \ldots, f_{n - 1}$ by considering all the $(n - 1)$ possible splits: -4. Create a 8-bit mask out of that and return it. Then you can feed it to `__builtin_ffs`. +$$ +f_n = \sum_{l = 1}^{n - 1} \frac{1}{n-1} \cdot \left( f_l \cdot \frac{l}{n} + f_{n - l} \cdot \frac{n - l}{n} \right) + 1 +$$ -This is how it looks using C++ intrinsics, which are basically built-in wrappers for raw assembly instructions: +Directly applying this formula gives us an $O(n^2)$ algorithm, but we can optimize it by rearranging the sum like this: -```cpp -// SIMD vector type names are weird and tedious to type, so we define an alias -typedef __m256i reg; +$$ +\begin{aligned} +f_n &= \sum_{i = 1}^{n - 1} \frac{ f_i \cdot i + f_{n - i} \cdot (n - i) }{ n \cdot (n - 1) } + 1 +\\ &= \frac{2}{n \cdot (n - 1)} \cdot \sum_{i = 1}^{n - 1} f_i \cdot i + 1 +\end{aligned} +$$ -// somewhere in the beginning of search loop: -reg x_vec = _mm256_set1_epi32(x); +To update $f_n$, we only need to calculate the sum of $f_i \cdot i$ for all $i < n$. To do that, let's introduce two new variables: -int cmp(reg x_vec, int* y_ptr) { - reg y_vec = _mm256_load_si256((reg*) y_ptr); - reg mask = _mm256_cmpgt_epi32(x_vec, y_vec); - return _mm256_movemask_ps((__m256) mask); -} -``` +$$ +g_n = f_n \cdot n, +\;\; +s_n = \sum_{i=1}^{n} g_i +$$ -After that, we call this function two times (because our node size / cache line happens to be 512 bits, which is twice as big) and blend these masks together with bitwise operations. +Now they can be sequentially calculated as: -## Final Implementation +$$ +\begin{aligned} +g_n &= f_n \cdot n + = \frac{2}{n-1} \cdot \sum_{i = 1}^{n - 1} g_i + n + = \frac{2}{n - 1} \cdot s_{n - 1} + n +\\ s_n &= s_{n - 1} + g_n +\end{aligned} +$$ -```cpp -#pragma GCC optimize("O3") -#pragma GCC target("avx2") +This way we get an $O(n)$ algorithm, but we can do even better. Let's substitute $g_n$ in the update formula for $s_n$: -#include -#include +$$ +\begin{aligned} +s_n &= s_{n - 1} + \frac{2}{n - 1} \cdot s_{n - 1} + n +\\ &= (1 + \frac{2}{n - 1}) \cdot s_{n - 1} + n +\\ &= \frac{n + 1}{n - 1} \cdot s_{n - 1} + n +\end{aligned} +$$ -using namespace std; + -typedef __m256i reg; +The next trick is more complicated.
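Before we get to that trick, here is a quick numerical sanity check (my own sketch, not from the article) that the $O(n)$ recurrences for $g_n$ and $s_n$ really agree with the direct $O(n^2)$ dynamic programming formula, using $f_1 = 0$ as the base case:

```c++
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int N = 3000;

    // direct O(N^2) dynamic programming over all splits
    std::vector<double> f(N + 1, 0); // f[1] = 0
    for (int n = 2; n <= N; n++) {
        double sum = 0;
        for (int l = 1; l <= n - 1; l++)
            sum += (f[l] * l + f[n - l] * (n - l)) / (n * (n - 1.0));
        f[n] = sum + 1;
    }

    // O(N) recurrences: g_n = 2 / (n - 1) * s_{n-1} + n,  s_n = s_{n-1} + g_n
    double s = 0; // s_1 = g_1 = f_1 * 1 = 0
    for (int n = 2; n <= N; n++) {
        double g = 2 / (n - 1.0) * s + n;
        s += g;
        assert(std::abs(g / n - f[n]) < 1e-9); // f_n = g_n / n
    }

    printf("both methods agree up to n = %d: f_n = %.5f\n", N, f[N]);
}
```

Both methods give $f_2 = 1$, $f_3 = 5/3$, and so on.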
We define $r_n$ like this: -const int n = (1<<20), B = 16; -const int nblocks = (n + B - 1) / B; -const int INF = numeric_limits::max(); +$$ +\begin{aligned} +r_n &= \frac{s_n}{n} +\\ &= \frac{1}{n} \cdot \left(\frac{n + 1}{n - 1} \cdot s_{n - 1} + n\right) +\\ &= \frac{n + 1}{n} \cdot \frac{s_{n - 1}}{n - 1} + 1 +\\ &= \left(1 + \frac{1}{n}\right) \cdot r_{n - 1} + 1 +\end{aligned} +$$ -alignas(64) int btree[nblocks][B]; +We can substitute it into the formula we got for $g_n$ before: -int go(int k, int i) { return k * (B + 1) + i + 1; } +$$ +g_n = \frac{2}{n - 1} \cdot s_{n - 1} + n = 2 \cdot r_{n - 1} + n +$$ -void build(int k = 0) { - static int t = 0; - if (k < nblocks) { - for (int i = 0; i < B; i++) { - build(go(k, i)); - btree[k][i] = (t < n ? a[t++] : INF); - } - build(go(k, B)); - } -} +Recalling that $g_n = f_n \cdot n$, we can express $r_{n - 1}$ using $f_n$: -int cmp(reg x_vec, int* y_ptr) { - reg y_vec = _mm256_load_si256((reg*) y_ptr); - reg mask = _mm256_cmpgt_epi32(x_vec, y_vec); - return _mm256_movemask_ps((__m256) mask); -} +$$ +f_n \cdot n = 2 \cdot r_{n - 1} + n +\implies +r_{n - 1} = \frac{(f_n - 1) \cdot n}{2} +$$ -int search(int x) { - int k = 0, res = INF; - reg x_vec = _mm256_set1_epi32(x); - while (k < nblocks) { - int mask = ~( - cmp(x_vec, &btree[k][0]) + - (cmp(x_vec, &btree[k][8]) << 8) - ); - int i = __builtin_ffs(mask) - 1; - if (i < B) - res = btree[k][i]; - k = go(k, i); - } - return res; -} -``` +Final step. We've just expressed $r_n$ through $r_{n - 1}$ and $r_{n - 1}$ through $f_n$. This lets us express $f_{n + 1}$ through $f_n$: + +$$ +\begin{aligned} +&&\quad r_n &= \left(1 + \frac{1}{n}\right) \cdot r_{n - 1} + 1 +\\ &\Rightarrow & \frac{(f_{n + 1} - 1) \cdot (n + 1)}{2} &= \left(1 + \frac{1}{n}\right) \cdot \frac{(f_n - 1) \cdot n}{2} + 1 +\\ &&&= \frac{n + 1}{2} \cdot (f_n - 1) + 1 +\\ &\Rightarrow & (f_{n + 1} - 1) &= (f_{n} - 1) + \frac{2}{n + 1} +\\ &\Rightarrow &f_{n + 1} &= f_{n} + \frac{2}{n + 1} +\\ &\Rightarrow &f_{n} &= f_{n - 1} + \frac{2}{n} +\\ &\Rightarrow &f_{n} &= \sum_{k = 2}^{n} \frac{2}{k} +\end{aligned} +$$ + +The last expression is double the [harmonic series](https://en.wikipedia.org/wiki/Harmonic_series_(mathematics)), which is well known to approximate $\ln n$ as $n \to \infty$. Therefore, the random binary search will perform $\frac{2 \ln n}{\log_2 n} = 2 \ln 2 \approx 1.386$ times more comparisons than the normal one. -That's it. This implementation should outperform even the [state-of-the-art indexes](http://kaldewey.com/pubs/FAST__SIGMOD10.pdf) used in high-performance databases, though it's mostly due to the fact that data structures used in real databases have to support fast updates while we don't. +### Acknowledgements -Note that this implementation is very specific to the architecture. Older CPUs and CPUs on mobile devices don't have 256-bit wide registers and will crash (but they likely have 128-bit SIMD so the loop can still be split in 4 parts instead of 2), non-Intel CPUs have their own instruction sets for SIMD, and some computers even have different cache line size. +The article is loosely based on "[Array Layouts for Comparison-Based Searching](https://arxiv.org/pdf/1509.05053.pdf)" by Paul-Virak Khuong and Pat Morin. It is 46 pages long and discusses these and many other (less successful) approaches in more detail. I highly recommend also checking it out — this is one of my favorite performance engineering papers.
-## Acknowledgements +Thanks to Marshall Lochbaum for [providing](https://github.com/algorithmica-org/algorithmica/issues/57) the proof for the random binary search. No way I could do it myself. -This tutorial is loosely based on a [46-page paper](https://arxiv.org/pdf/1509.05053.pdf) by Paul-Virak Khuong and Pat Morin "Array layouts for comparison-based searching". +I also stole these lovely layout visualizations from some blog a long time ago, but I don't remember the name of the blog and what license they had, and inverse image search doesn't find them anymore. If you don't sue me, thank you, whoever you are!
diff --git a/content/english/hpc/data-structures/filters.md b/content/english/hpc/data-structures/filters.md new file mode 100644 index 00000000..e8d38669 --- /dev/null +++ b/content/english/hpc/data-structures/filters.md @@ -0,0 +1,9 @@ +--- +title: Probabilistic Filters +weight: 10 +draft: true +--- + +bloom filters have the inverse behavior of caches* +- bloom filter: miss == definitely not present, hit == probably present +- cache: miss == probably not present, hit == definitely present
diff --git a/content/english/hpc/data-structures/img/b-tree.jpg b/content/english/hpc/data-structures/img/b-tree.jpg new file mode 100644 index 00000000..c0c2117c Binary files /dev/null and b/content/english/hpc/data-structures/img/b-tree.jpg differ
diff --git a/content/english/hpc/data-structures/img/bplus.png b/content/english/hpc/data-structures/img/bplus.png new file mode 100644 index 00000000..c1090668 Binary files /dev/null and b/content/english/hpc/data-structures/img/bplus.png differ
diff --git a/content/english/hpc/data-structures/img/btree-absl.svg b/content/english/hpc/data-structures/img/btree-absl.svg new file mode 100644 index 00000000..4ed0a949 @@ -0,0 +1,1344 @@ (new SVG plot; markup not preserved)
diff --git a/content/english/hpc/data-structures/img/btree-absolute.svg b/content/english/hpc/data-structures/img/btree-absolute.svg new file mode 100644 index 00000000..6709908f @@ -0,0 +1,1430 @@ (new SVG plot; markup not preserved)
[Omitted: the markup and binary contents of the remaining image files added under content/english/hpc/data-structures/img/.
New SVG plots: btree-relative, search-all, search-bplus, search-bplus-other, search-branchless, search-branchless-prefetch, search-btree, search-btree-hugepages, search-btree-optimized, search-eytzinger, search-eytzinger-branchless, search-eytzinger-prefetch, search-eytzinger-small, search-latency-bplus, search-random, search-random-relative, search-relative, search-relative-latency, search-set-relative, search-set-relative-all, search-std, segtree-bottomup, segtree-branchless, segtree-fenwick, segtree-fenwick-holes, segtree-iterative, segtree-pointers, segtree-popular, segtree-popular-relative, segtree-simd, segtree-simd-others, segtree-topdown.
New PNG images: eytzinger_old, fenwick-sum, fenwick-update, segtree-layout, segtree-path, segtree-permuted, segtree-succinct, segtree-wide; the existing eytzinger.png is replaced.
New figure sources under img/src/: eytzinger.svg, fenwick-sum.svg, fenwick-update.svg, segtree-layout.svg, segtree-path.svg, segtree-permuted.svg, segtree-succinct.svg.]
diff --git a/content/english/hpc/data-structures/img/src/segtree-wide.svg b/content/english/hpc/data-structures/img/src/segtree-wide.svg
new file mode 100644
index 00000000..1d38e472
diff --git a/content/english/hpc/data-structures/s-tree.md b/content/english/hpc/data-structures/s-tree.md
new file mode 100644
index 00000000..875f72ec
--- /dev/null
+++ b/content/english/hpc/data-structures/s-tree.md
@@ -0,0 +1,622 @@
---
title: Static B-Trees
weight: 2
---

This section is a follow-up to the [previous one](../binary-search), where we optimized binary search by means of removing branching and improving the memory layout. Here, we will also be searching in sorted arrays, but this time we are not limited to fetching and comparing only one element at a time.

In this section, we generalize the techniques we developed for binary search to *static B-trees* and accelerate them further using [SIMD instructions](/hpc/simd). In particular, we develop two new implicit data structures:

- The [first](#b-tree-layout) is based on the memory layout of a B-tree, and, depending on the array size, it is up to 8x faster than `std::lower_bound` while using the same space as the array and only requiring a permutation of its elements.
- The [second](#b-tree-layout-1) is based on the memory layout of a B+ tree, and it is up to 15x faster than `std::lower_bound` while using just 6-7% more memory — or 6-7% **of** the memory if we can keep the original sorted array.
+ +To distinguish them from B-trees — the structures with pointers, hundreds to thousands of keys per node, and empty spaces in them — we will use the names *S-tree* and *S+ tree* respectively to refer to these particular memory layouts[^name]. + +[^name]: [Similar to B-trees](https://en.wikipedia.org/wiki/B-tree#Origin), "the more you think about what the S in S-trees means, the better you understand S-trees." + + + +To the best of my knowledge, this is a significant improvement over the existing [approaches](http://kaldewey.com/pubs/FAST__SIGMOD10.pdf). As before, we are using Clang 10 targeting a Zen 2 CPU, but the performance improvements should approximately transfer to most other platforms, including Arm-based chips. Use [this single-source benchmark](https://github.com/sslotin/amh-code/blob/main/binsearch/standalone.cc) of the final implementation if you want to test it on your machine. + +This is a long article, and since it also serves as a [textbook](/hpc/) case study, we will improve the algorithm incrementally for pedagogical goals. If you are already an expert and feel comfortable reading [intrinsic](/hpc/simd/intrinsics)-heavy code with little to no context, you can jump straight to the [final implementation](#implicit-b-tree-1). + +## B-Tree Layout + +B-trees generalize the concept of binary search trees by allowing nodes to have more than two children. Instead of a single key, a node of a B-tree of order $k$ can contain up to $B = (k - 1)$ keys stored in sorted order and up to $k$ pointers to child nodes. Each child $i$ satisfies the property that all keys in its subtree are between keys $(i - 1)$ and $i$ of the parent node (if they exist). + +![A B-tree of order 4](../img/b-tree.jpg) + +The main advantage of this approach is that it reduces the tree height by $\frac{\log_2 n}{\log_k n} = \frac{\log k}{\log 2} = \log_2 k$ times, while fetching each node still takes roughly the same time — as long it fits into a single [memory block](/hpc/external-memory/hierarchy/). + +B-trees were primarily developed for the purpose of managing on-disk databases, where the latency of randomly fetching a single byte is comparable with the time it takes to read the next 1MB of data sequentially. For our use case, we will be using the block size of $B = 16$ elements — or $64$ bytes, the size of the cache line — which makes the tree height and the total number of cache line fetches per query $\log_2 17 \approx 4$ times smaller compared to the binary search. + +### Implicit B-Tree + +Storing and fetching pointers in a B-tree node wastes precious cache space and decreases performance, but they are essential for changing the tree structure on inserts and deletions. But when there are no updates and the structure of a tree is *static*, we can get rid of the pointers, which makes the structure *implicit*. + +One of the ways to achieve this is by generalizing the [Eytzinger numeration](../binary-search#eytzinger-layout) to $(B + 1)$-ary trees: + +- The root node is numbered $0$. +- Node $k$ has $(B + 1)$ child nodes numbered $\\{k \cdot (B + 1) + i + 1\\}$ for $i \in [0, B]$. 
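For a quick illustration of this numbering with a toy fanout of $B + 1 = 3$: the root $0$ has children $1, 2, 3$; node $1$ has children $4, 5, 6$; node $2$ has children $7, 8, 9$; and so on — the same rule we will apply below with the real $B = 16$.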
+ +This way, we can only use $O(1)$ additional memory by allocating one large two-dimensional array of keys and relying on index arithmetic to locate children nodes in the tree: + +```c++ +const int B = 16; + +int nblocks = (n + B - 1) / B; +int btree[nblocks][B]; + +int go(int k, int i) { return k * (B + 1) + i + 1; } +``` + + + +This numeration automatically makes the B-tree complete or almost complete with the height of $\Theta(\log_{B + 1} n)$. If the length of the initial array is not a multiple of $B$, the last block is padded with the largest value of its data type. + +### Construction + +We can construct the B-tree similar to how we constructed the Eytzinger array — by traversing the search tree: + +```c++ +void build(int k = 0) { + static int t = 0; + if (k < nblocks) { + for (int i = 0; i < B; i++) { + build(go(k, i)); + btree[k][i] = (t < n ? a[t++] : INT_MAX); + } + build(go(k, B)); + } +} +``` + +It is correct because each value of the initial array will be copied to a unique position in the resulting array, and the tree height is $\Theta(\log_{B+1} n)$ because $k$ is multiplied by $(B + 1)$ each time we descend into a child node. + +Note that this numeration causes a slight imbalance: left-er children may have larger subtrees, although this is only true for $O(\log_{B+1} n)$ parent nodes. + +### Searches + +To find the lower bound, we need to fetch the $B$ keys in a node, find the first key $a_i$ not less than $x$, descend to the $i$-th child — and continue until we reach a leaf node. There is some variability in how to find that first key. For example, we could do a tiny internal binary search that makes $O(\log B)$ iterations, or maybe just compare each key sequentially in $O(B)$ time until we find the local lower bound, hopefully exiting from the loop a bit early. + +But we are not going to do that — because we can use [SIMD](/hpc/simd). It doesn't work well with branching, so essentially what we want to do is to compare against all $B$ elements regardless, compute a bitmask out of these comparisons, and then use the `ffs` instruction to find the bit corresponding to the first non-lesser element: + +```cpp +int mask = (1 << B); + +for (int i = 0; i < B; i++) + mask |= (btree[k][i] >= x) << i; + +int i = __builtin_ffs(mask) - 1; +// now i is the number of the correct child node +``` + +Unfortunately, the compilers are not smart enough to [auto-vectorize](/hpc/simd/auto-vectorization/) this code yet, so we have to optimize it manually. In AVX2, we can load 8 elements, compare them against the search key, producing a [vector mask](/hpc/simd/masking/), and then extract the scalar mask from it with `movemask`. Here is a minimized illustrated example of what we want to do: + +```center + y = 4 17 65 103 + x = 42 42 42 42 + y ≥ x = 00000000 00000000 11111111 11111111 + ├┬┬┬─────┴────────┴────────┘ +movemask = 0011 + ┌─┘ + ffs = 3 +``` + +Since we are limited to processing 8 elements at a time (half our block / cache line size), we have to split the elements into two groups and then combine the two 8-bit masks. 
To do this, it will be slightly easier to swap the condition for `x > y` and compute the inverted mask instead: + +```c++ +typedef __m256i reg; + +int cmp(reg x_vec, int* y_ptr) { + reg y_vec = _mm256_load_si256((reg*) y_ptr); // load 8 sorted elements + reg mask = _mm256_cmpgt_epi32(x_vec, y_vec); // compare against the key + return _mm256_movemask_ps((__m256) mask); // extract the 8-bit mask +} +``` + +Now, to process the entire block, we need to call it twice and combine the masks: + +```c++ +int mask = ~( + cmp(x, &btree[k][0]) + + (cmp(x, &btree[k][8]) << 8) +); +``` + +To descend down the tree, we use `ffs` on that mask to get the correct child number and just call the `go` function we defined earlier: + +```c++ +int i = __builtin_ffs(mask) - 1; +k = go(k, i); +``` + +To actually return the result in the end, we'd want to just fetch `btree[k][i]` in the last node we visited, but the problem is that sometimes the local lower bound doesn't exist ($i \ge B$) because $x$ happens to be greater than all the keys in the node. We could, in theory, do the same thing we did for the [Eytzinger binary search](../binary-search/#search-implementation) and restore the correct element *after* we calculate the last index, but we don't have a nice bit trick this time and have to do a lot of [divisions by 17](/hpc/arithmetic/division) to compute it, which will be slow and almost certainly not worth it. + +Instead, we can just remember and return the last local lower bound we encountered when we descended the tree: + +```c++ +int lower_bound(int _x) { + int k = 0, res = INT_MAX; + reg x = _mm256_set1_epi32(_x); + while (k < nblocks) { + int mask = ~( + cmp(x, &btree[k][0]) + + (cmp(x, &btree[k][8]) << 8) + ); + int i = __builtin_ffs(mask) - 1; + if (i < B) + res = btree[k][i]; + k = go(k, i); + } + return res; +} +``` + +This implementation outperforms all previous binary search implementations, and by a huge margin: + +![](../img/search-btree.svg) + +This is very good — but we can optimize it even further. + +### Optimization + +Before everything else, let's allocate the memory for the array on a [hugepage](/hpc/cpu-cache/paging): + +```c++ +const int P = 1 << 21; // page size in bytes (2MB) +const int T = (64 * nblocks + P - 1) / P * P; // can only allocate whole number of pages +btree = (int(*)[16]) std::aligned_alloc(P, T); +madvise(btree, T, MADV_HUGEPAGE); +``` + +This slightly improves the performance on larger array sizes: + +![](../img/search-btree-hugepages.svg) + +Ideally, we'd also need to enable hugepages for all [previous implementations](../binary-search) to make the comparison fair, but it doesn't matter that much because they all have some form of prefetching that alleviates this problem. + +With that settled, let's begin real optimization. First of all, we'd want to use compile-time constants instead of variables as much as possible because it lets the compiler embed them in the machine code, unroll loops, optimize arithmetic, and do all sorts of other nice stuff for us for free. Specifically, we want to know the tree height in advance: + + + +```c++ +constexpr int height(int n) { + // grow the tree until its size exceeds n elements + int s = 0, // total size so far + l = B, // size of the next layer + h = 0; // height so far + while (s + l - B < n) { + s += l; + l *= (B + 1); + h++; + } + return h; +} + +const int H = height(N); +``` + + + +Next, we can find the local lower bound in nodes faster. 
Instead of calculating it separately for two 8-element blocks and merging two 8-bit masks, we combine the vector masks using the [packs](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=3037,4870,6715,4845,3853,90,7307,5993,2692,6946,6949,5456,6938,5456,1021,3007,514,518,7253,7183,3892,5135,5260,3915,4027,3873,7401,4376,4229,151,2324,2310,2324,4075,6130,4875,6385,5259,6385,6250,1395,7253,6452,7492,4669,4669,7253,1039,1029,4669,4707,7253,7242,848,879,848,7251,4275,879,874,849,833,6046,7250,4870,4872,4875,849,849,5144,4875,4787,4787,4787,5227,7359,7335,7392,4787,5259,5230,5223,6438,488,483,6165,6570,6554,289,6792,6554,5230,6385,5260,5259,289,288,3037,3009,590,604,5230,5259,6554,6554,5259,6547,6554,3841,5214,5229,5260,5259,7335,5259,519,1029,515,3009,3009,3011,515,6527,652,6527,6554,288,3841,5230,5259,5230,5259,305,5259,591,633,633,5259,5230,5259,5259,3017,3018,3037,3018,3017,3016,3013,5144&text=_mm256_packs_epi32&techs=AVX,AVX2) instruction and readily extract it using `movemask` just once: + +```c++ +unsigned rank(reg x, int* y) { + reg a = _mm256_load_si256((reg*) y); + reg b = _mm256_load_si256((reg*) (y + 8)); + + reg ca = _mm256_cmpgt_epi32(a, x); + reg cb = _mm256_cmpgt_epi32(b, x); + + reg c = _mm256_packs_epi32(ca, cb); + int mask = _mm256_movemask_epi8(c); + + // we need to divide the result by two because we call movemask_epi8 on 16-bit masks: + return __tzcnt_u32(mask) >> 1; +} +``` + +This instruction converts 32-bit integers stored in two registers to 16-bit integers stored in one register — in our case, effectively joining the vector masks into one. Note that we've swapped the order of comparison — this lets us not invert the mask in the end, but we have to subtract[^float] one from the search key once in the beginning to make it correct (otherwise, it works as `upper_bound`). + +[^float]: If you need to work with [floating-point](/hpc/arithmetic/float) keys, consider whether `upper_bound` will suffice — because if you need `lower_bound` specifically, then subtracting one or the machine epsilon from the search key doesn't work: you need to [get the previous representable number](https://stackoverflow.com/questions/10160079/how-to-find-nearest-next-previous-double-value-numeric-limitsepsilon-for-give) instead. Aside from some corner cases, this essentially means reinterpreting its bits as an integer, subtracting one, and reinterpreting it back as a float (which magically works because of how [IEEE-754 floating-point numbers](/hpc/arithmetic/ieee-754) are stored in memory). + +The problem is, it does this weird interleaving where the result is written in the `a1 b1 a2 b2` order instead of `a1 a2 b1 b2` that we want — many AVX2 instructions tend to do that. To correct this, we need to [permute](/hpc/simd/shuffling) the resulting vector, but instead of doing it during the query time, we can just permute every node during preprocessing: + +```c++ +void permute(int *node) { + const reg perm = _mm256_setr_epi32(4, 5, 6, 7, 0, 1, 2, 3); + reg* middle = (reg*) (node + 4); + reg x = _mm256_loadu_si256(middle); + x = _mm256_permutevar8x32_epi32(x, perm); + _mm256_storeu_si256(middle, x); +} +``` + +Now we just call `permute(&btree[k])` right after we are done building the node. There are probably faster ways to swap the middle elements, but we will leave it here as the preprocessing time is not that important for now. 
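To make the effect of this preprocessing step more tangible, here is a sketch of the key order inside one node before and after `permute`, writing the keys in sorted order as $k_0 < k_1 < \ldots < k_{15}$:

```c++
// before permute():  k0 k1 k2 k3 | k4 k5 k6  k7  | k8 k9 k10 k11 | k12 k13 k14 k15
// after  permute():  k0 k1 k2 k3 | k8 k9 k10 k11 | k4 k5 k6  k7  | k12 k13 k14 k15
//
// this is exactly the "a1 b1 a2 b2" order in which _mm256_packs_epi32 interleaves
// its two inputs, so the comparison mask extracted in rank() ends up ordered by key
```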
+ +This new SIMD routine is significantly faster because the extra `movemask` is slow, and also blending the two masks takes quite a few instructions. Unfortunately, we now can't just do the `res = btree[k][i]` update anymore because the elements are permuted. We can solve this problem with some bit-level trickery in terms of `i`, but indexing a small lookup table turns out to be faster and also doesn't require a new branch: + +```c++ +const int translate[17] = { + 0, 1, 2, 3, + 8, 9, 10, 11, + 4, 5, 6, 7, + 12, 13, 14, 15, + 0 +}; + +void update(int &res, int* node, unsigned i) { + int val = node[translate[i]]; + res = (i < B ? val : res); +} +``` + +This `update` procedure takes some time, but it's not on the critical path between the iterations, so it doesn't affect the actual performance that much. + +Stitching it all together (and leaving out some other minor optimizations): + +```c++ +int lower_bound(int _x) { + int k = 0, res = INT_MAX; + reg x = _mm256_set1_epi32(_x - 1); + for (int h = 0; h < H - 1; h++) { + unsigned i = rank(x, &btree[k]); + update(res, &btree[k], i); + k = go(k, i); + } + // the last branch: + if (k < nblocks) { + unsigned i = rank(x, btree[k]); + update(res, &btree[k], i); + } + return res; +} +``` + +All this work saved us 15-20% or so: + +![](../img/search-btree-optimized.svg) + +It doesn't feel very satisfying so far, but we will reuse these optimization ideas later. + +There are two main problems with the current implementation: + +- The `update` procedure is quite costly, especially considering that it is very likely going to be useless: 16 out of 17 times, we can just fetch the result from the last block. +- We do a non-constant number of iterations, causing branch prediction problems similar to how it did for the [Eytzinger binary search](../binary-search/#removing-the-last-branch); you can also see it on the graph this time, but the latency bumps have a period of $2^4$. + +To address these problems, we need to change the layout a little bit. + +## B+ Tree Layout + +Most of the time, when people talk about B-trees, they really mean *B+ trees*, which is a modification that distinguishes between the two types of nodes: + +- *Internal nodes* store up to $B$ keys and $(B + 1)$ pointers to child nodes. The key number $i$ is always equal to the smallest key in the subtree of the $(i + 1)$-th child node. +- *Data nodes* or *leaves* store up to $B$ keys, the pointer to the next leaf node, and, optionally, an associated value for each key — if the structure is used as a key-value map. + +The advantages of this approach include faster search time (as the internal nodes only store keys) and the ability to quickly iterate over a range of entries (by following next leaf node pointers), but this comes at the cost of some memory overhead: we have to store copies of keys in the internal nodes. + +![A B+ tree of order 4](../img/bplus.png) + +Back to our use case, this layout can help us solve our two problems: + +- Either the last node we descend into has the local lower bound, or it is the first key of the next leaf node, so we don't need to call `update` on each iteration. +- The depth of all leaves is constant because B+ trees grow at the root and not at the leaves, which removes the need for branching. + +The disadvantage is that this layout is not *succinct*: we need some additional memory to store the internal nodes — about $\frac{1}{16}$-th of the original array size, to be exact — but the performance improvement will be more than worth it. 
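As a quick sanity check of that estimate: with $B = 16$, each internal layer holds roughly $\frac{1}{B + 1} = \frac{1}{17}$ as many keys as the layer below it, so all the internal layers together take about $n \cdot (\frac{1}{17} + \frac{1}{17^2} + \ldots) = \frac{n}{16}$ extra cells — which is where both the $\frac{1}{16}$-th and the 6-7% figures come from.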
### Implicit B+ Tree

To be more explicit with pointer arithmetic, we will store the entire tree in a single one-dimensional array. To minimize index computations during run time, we will store each layer sequentially in this array and use compile-time-computed offsets to address them: the keys of node number `k` on layer `h` start at `btree[offset(h) + k * B]`, and its `i`-th child will be at `btree[offset(h - 1) + (k * (B + 1) + i) * B]`.

To implement all that, we need a few more `constexpr` functions:

```c++
// number of B-element blocks in a layer with n keys
constexpr int blocks(int n) {
    return (n + B - 1) / B;
}

// number of keys on the layer previous to one with n keys
constexpr int prev_keys(int n) {
    return (blocks(n) + B) / (B + 1) * B;
}

// height of a balanced n-key B+ tree
constexpr int height(int n) {
    return (n <= B ? 1 : height(prev_keys(n)) + 1);
}

// where the layer h starts (layer 0 is the largest)
constexpr int offset(int h) {
    int k = 0, n = N;
    while (h--) {
        k += blocks(n) * B;
        n = prev_keys(n);
    }
    return k;
}

const int H = height(N);
const int S = offset(H); // the tree size is the offset of the (non-existent) layer H

int *btree; // the tree itself is stored in a single hugepage-aligned array of size S
```

Note that we store the layers in reverse order, but the nodes within a layer and the data in them are still left-to-right, and the layers are numbered bottom-up: the leaves form the zeroth layer, and the root is layer `H - 1`. These are arbitrary decisions — they just make the code slightly easier to write.

### Construction

To construct the tree from a sorted array `a`, we first need to copy it into the zeroth layer and pad it with infinities:

```c++
memcpy(btree, a, 4 * N);

for (int i = N; i < S; i++)
    btree[i] = INT_MAX;
```

Now we build the internal nodes, layer by layer. For each key, we need to descend to the right of it, then always go left until we reach a leaf node, and take its first key — it will be the smallest one in that subtree:

```c++
for (int h = 1; h < H; h++) {
    for (int i = 0; i < offset(h + 1) - offset(h); i++) {
        // i = k * B + j
        int k = i / B,
            j = i - k * B;
        k = k * (B + 1) + j + 1; // compare to the right of the key
        // and then always to the left
        for (int l = 0; l < h - 1; l++)
            k *= (B + 1);
        // pad the rest with infinities if the key doesn't exist
        btree[offset(h) + i] = (k * B < N ? btree[k * B] : INT_MAX);
    }
}
```

And just the finishing touch — we need to permute the keys in the internal nodes to search them faster:

```c++
for (int i = offset(1); i < S; i += B)
    permute(btree + i);
```

We start from `offset(1)`: we specifically don't permute the leaf nodes and leave them in the original sorted order. The motivation is that we'd otherwise need to do the same complex index translation we did in `update`, and it would be on the critical path, as this is the last operation of the query. So, just for this layer, we switch to the original mask-blending local lower bound procedure.
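In the search procedure in the next section, these two per-node routines appear as `permuted_rank` — which is just the `packs`-based `rank` function from the S-tree section — and `direct_rank`, its counterpart for the leaf layer that is kept in sorted order. Here is one way `direct_rank` might look — a minimal sketch, assuming the `reg` typedef and the decremented search key from before:

```c++
unsigned direct_rank(reg x, int *y) {
    reg a = _mm256_load_si256((reg*) y);
    reg b = _mm256_load_si256((reg*) (y + 8));

    // x holds the search key minus one, so "greater than x" means "not less than the key"
    reg ca = _mm256_cmpgt_epi32(a, x);
    reg cb = _mm256_cmpgt_epi32(b, x);

    unsigned mask = (1 << 16)                              // sentinel bit: all 16 keys are smaller
                  | (_mm256_movemask_ps((__m256) cb) << 8)
                  |  _mm256_movemask_ps((__m256) ca);

    return __tzcnt_u32(mask); // may be 16, pointing at the first key of the next leaf node
}
```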
+ +### Searching + +The search procedure becomes simpler than for the B-tree layout: we don't need to do `update` and only execute a fixed number of iterations — although the last one with some special treatment: + +```c++ +int lower_bound(int _x) { + unsigned k = 0; // we assume k already multiplied by B to optimize pointer arithmetic + reg x = _mm256_set1_epi32(_x - 1); + for (int h = H - 1; h > 0; h--) { + unsigned i = permuted_rank(x, btree + offset(h) + k); + k = k * (B + 1) + i * B; + } + unsigned i = direct_rank(x, btree + k); + return btree[k + i]; +} +``` + +Switching to the B+ layout more than paid off: the S+ tree is 1.5-3x faster compared to the optimized S-tree: + +![](../img/search-bplus.svg) + +The spikes at the high end of the graph are caused by the L1 TLB not being large enough: it has 64 entries, so it can handle at most 64 × 2 = 128MB of data, which is exactly what is required for storing `2^25` integers. The S+ tree hits this limit slightly sooner because of the ~7% memory overhead. + +### Comparison with `std::lower_bound` + +We've come a long way from binary search: + +![](../img/search-all.svg) + +On these scales, it makes more sense to look at the relative speedup: + +![](../img/search-relative.svg) + +The cliffs at the beginning of the graph are because the running time of `std::lower_bound` grows smoothly with the array size, while for an S+ tree, it is locally flat and increases in discrete steps when a new layer needs to be added. + +One important asterisk we haven't discussed is that what we are measuring is not real latency, but the *reciprocal throughput* — the total time it takes to execute a lot of queries divided by the number of queries: + +```c++ +clock_t start = clock(); + +for (int i = 0; i < m; i++) + checksum ^= lower_bound(q[i]); + +float seconds = float(clock() - start) / CLOCKS_PER_SEC; +printf("%.2f ns per query\n", 1e9 * seconds / m); +``` + +To measure *actual* latency, we need to introduce a dependency between the loop iterations so that the next query can't start before the previous one finishes: + +```c++ +int last = 0; + +for (int i = 0; i < m; i++) { + last = lower_bound(q[i] ^ last); + checksum ^= last; +} +``` + +In terms of real latency, the speedup is not that impressive: + +![](../img/search-relative-latency.svg) + +A lot of the performance boost of the S+ tree comes from removing branching and minimizing memory requests, which allows overlapping the execution of more adjacent queries — apparently, around three on average. + + + +Although nobody except maybe the HFT people cares about real latency, and everybody actually measures throughput even when using the word "latency," this nuance is still something to take into account when predicting the possible speedup in user applications. + +### Modifications and Further Optimizations + + + +To minimize the number of memory accesses during a query, we can increase the block size. To find the local lower bound in a 32-element node (spanning two cache lines and four AVX2 registers), we can use a [similar trick](https://github.com/sslotin/amh-code/blob/a74495a2c19dddc697f94221629c38fee09fa5ee/binsearch/bplus32.cc#L94) that uses two `packs_epi32` and one `packs_epi16` to combine masks. + +We can also try to use the cache more efficiently by controlling where each tree layer is stored in the cache hierarchy. 
We can do that by prefetching nodes to a [specific level](/hpc/cpu-cache/prefetching/#software-prefetching) and using [non-temporal reads](/hpc/cpu-cache/bandwidth/#bypassing-the-cache) during queries.

I implemented two versions of these optimizations: one with a block size of 32 and one where the last read is non-temporal. They don't improve the throughput:

![](../img/search-bplus-other.svg)

…but they do make the latency lower:

![](../img/search-latency-bplus.svg)

Ideas that I have not yet managed to implement but consider highly promising are:

- Make the block size non-uniform. The motivation is that the slowdown from having one 32-element layer is less than from having two separate layers. Also, the root is often not full, so perhaps sometimes it should have only 8 keys or even just one key. Picking the optimal layer configuration for a given array size should remove the spikes from the relative speedup graph and make it look more like its upper envelope.

  I know how to do this with code generation, but I went for a generic solution and tried to [implement](https://github.com/sslotin/amh-code/blob/main/binsearch/bplus-adaptive.cc) it with the facilities of modern C++; unfortunately, the compiler can't produce optimal code this way.
- Group nodes with one or two generations of their descendants (~300 nodes / ~5k keys) so that they are close in memory — in the spirit of what [FAST](http://kaldewey.com/pubs/FAST__SIGMOD10.pdf) calls hierarchical blocking. This reduces the severity of TLB misses and may also improve the latency, as the memory controller may choose to keep the [RAM row buffer](/hpc/cpu-cache/aos-soa/#ram-specific-timings) open, anticipating local reads.
- Optionally use prefetching on some specific layers. Aside from the $\frac{1}{17}$-th chance of it fetching exactly the node we need, the hardware prefetcher may also get some of its neighbors for us if the data bus is not busy. It also has the same TLB and row buffer effects as with blocking.

Other possible minor optimizations include:

- Permuting the nodes of the last layer as well — if we only need the index and not the value.
- Reversing the order in which the layers are stored so that the first few layers accessed during a query are on the same page.
- Rewriting the whole thing in assembly, as the compiler seems to struggle with pointer arithmetic.
- Using [blending](/hpc/simd/masking) instead of `packs`: you can odd-even shuffle the node keys (`[1 3 5 7] [2 4 6 8]`), compare against the search key, and then blend the low 16 bits of the first register mask with the high 16 bits of the second. Blending is slightly faster on many architectures, and it may also help to alternate between packing and blending as they use different subsets of ports. (Thanks to Const-me from HackerNews for [suggesting](https://news.ycombinator.com/item?id=30381912) it.)
- Using [popcount](/hpc/simd/shuffling/#shuffles-and-popcount) instead of `tzcnt`: the index `i` is equal to the number of keys less than `x`, so we can compare `x` against all keys, combine the vector masks any way we want, call `movemask`, and then calculate the number of set bits with `popcnt`. This removes the need to store the keys in any particular order, which lets us skip the permutation step and also use this procedure on the last layer as well.
- Defining the key $i$ as the *maximum* key in the subtree of child $i$ instead of the *minimum* key in the subtree of child $(i + 1)$.
The correctness doesn't change, but this guarantees that the result will be stored in the last node we access (and not in the first element of the next neighbor node), which lets us fetch slightly fewer cache lines. + +Note that the current implementation is specific to AVX2 and may require some non-trivial changes to adapt to other platforms. It would be interesting to port it for Intel CPUs with AVX-512 and Arm CPUs with 128-bit NEON, which may require some [trickery](https://github.com/WebAssembly/simd/issues/131) to work. + + + +With these optimizations implemented, I wouldn't be surprised to see another 10-30% improvement and over 10x speedup over `std::lower_bound` on large arrays for some platforms. + +### As a Dynamic Tree + +The comparison is even more favorable against `std::set` and other pointer-based trees. In our benchmark, we add the same elements (without measuring the time it takes to add them) and use the same lower bound queries, and the S+ tree is up to 30x faster: + +![](../img/search-set-relative.svg) + +This suggests that we can probably use this approach to also improve on *dynamic* search trees by a large margin. + +To validate this hypothesis, I added an array of 17 indices for each node that point to where their children should be and used this array to descend the tree instead of the usual implicit numbering. This array is separate from the tree, not aligned, and isn't even on a hugepage — the only optimization we do is prefetch the first and the last pointer of a node. + +I also added [B-tree from Abseil](https://abseil.io/blog/20190812-btree) to the comparison, which is the only widely-used B-tree implementation I know of. It performs just slightly better than `std::lower_bound`, while the S+ tree with pointers is ~15x faster for large arrays: + + + +![](../img/search-set-relative-all.svg) + +Of course, this comparison is not fair, as implementing a dynamic search tree is a more high-dimensional problem. + +We'd also need to implement the update operation, which will not be that efficient, and for which we'd need to sacrifice the fanout factor. But it still seems possible to implement a 10-20x faster `std::set` and a 3-5x faster `absl::btree_set`, depending on how you define "faster" — and this is one of the things we'll [attempt to do next](../b-tree). + + + + +### Acknowledgements + +This [StackOverflow answer](https://stackoverflow.com/questions/20616605/using-simd-avx-sse-for-tree-traversal) by Cory Nelson is where I took the permuted 16-element search trick from. + + + + diff --git a/content/english/hpc/data-structures/segment-trees.md b/content/english/hpc/data-structures/segment-trees.md new file mode 100644 index 00000000..9ad14608 --- /dev/null +++ b/content/english/hpc/data-structures/segment-trees.md @@ -0,0 +1,756 @@ +--- +title: Segment Trees +weight: 4 +published: true +--- + +The lessons learned from [optimizing](../s-tree) [binary search](../binary-search) can be applied to a broad range of data structures. 
+ +In this article, instead of trying to optimize something from the STL again, we focus on *segment trees*, the structures that may be unfamiliar to most *normal* programmers and perhaps even most computer science researchers[^tcs], but that are used [very extensively](https://www.google.com/search?q=segment+tree+site%3Acodeforces.com&newwindow=1&sxsrf=APq-WBuTupSOnSn9JNEHhaqtmv0Uq0eogQ%3A1645969931499&ei=C4IbYrb2HYibrgS9t6qgDQ&ved=0ahUKEwj2p8_og6D2AhWIjYsKHb2bCtQQ4dUDCA4&uact=5&oq=segment+tree+site%3Acodeforces.com&gs_lcp=Cgdnd3Mtd2l6EAM6BwgAEEcQsAM6BwgAELADEEM6BAgjECc6BAgAEEM6BQgAEIAEOgYIABAWEB46BQghEKABSgQIQRgASgQIRhgAUMkFWLUjYOgkaANwAXgAgAHzAYgB9A-SAQYxNS41LjGYAQCgAQHIAQrAAQE&sclient=gws-wiz) in programming competitions for their speed and simplicity of implementation. + +[^tcs]: Segment trees are rarely mentioned in the theoretical computer science literature because they are relatively novel (invented ~2000), mostly don't do anything that [any other binary tree](https://en.wikipedia.org/wiki/Tree_(data_structure)) can't do, and *asymptotically* aren't faster — although, in practice, they often win by a lot in terms of speed. + +(If you already know the context, jump straight to the [last section](#wide-segment-trees) for the novelty: the *wide segment tree* that works 4 to 12 times faster than the Fenwick tree.) + +### Dynamic Prefix Sum + + + +Segment trees are cool and can do lots of different things, but in this article, we will focus on their simplest non-trivial application — *the dynamic prefix sum problem*: + +```cpp +void add(int k, int x); // react to a[k] += x (zero-based indexing) +int sum(int k); // return the sum of the first k elements (from 0 to k - 1) +``` + +As we have to support two types of queries, our optimization problem becomes multi-dimensional, and the optimal solution depends on the distribution of queries. For example, if one type of the queries were extremely rare, we would only optimize for the other, which is relatively easy to do: + +- If we only cared about the cost of *updating the array*, we would store it as it is and [calculate the sum](/hpc/simd/reduction) directly on each `sum` query. +- If we only cared about the cost of *prefix sum queries*, we would keep it ready and [re-calculate them entirely from scratch](/hpc/algorithms/prefix) on each update. + +Both of these options perform $O(1)$ work on one query type but $O(n)$ work on the other. When the query frequencies are relatively close, we can trade off some performance on one type of query for increased performance on the other. Segment trees let you do exactly that, achieving the equilibrium of $O(\log n)$ work for both queries. + +### Segment Tree Structure + +The main idea behind segment trees is this: + +- calculate the sum of the entire array and write it down somewhere; +- split the array into two halves, calculate the sum on both halves, and also write them down somewhere; +- split these halves into halves, calculate the total of four sums on them, and also write them down; +- …and so on, until we recursively reach segments of length one. 
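For a tiny concrete example, take a hypothetical 4-element array $[5, 2, 4, 7]$: we'd write down the total $18$, the two half-sums $7$ and $11$, and the four elements themselves — $2 \cdot 4 - 1 = 7$ values in total.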
+ +These computed subsegment sums can be logically represented as a binary tree — which is what we call a *segment tree*: + +![A segment tree with with the nodes relevant for the sum(11) and add(10) queries highlighted](../img/segtree-path.png) + +Segment trees have some nice properties: + +- If the underlying array has $n$ elements, the segment tree has exactly $(2n - 1)$ nodes — $n$ leaves and $(n - 1)$ internal nodes — because each internal node splits a segment in two, and you only need $(n - 1)$ of them to completely split the original $[0, n-1]$ range. +- The height of the tree is $\Theta(\log n)$: on each next level starting from the root, the number of nodes roughly doubles and the size of their segments roughly halves. +- Each segment can be split into $O(\log n)$ non-intersecting segments that correspond to the nodes of the segment tree: you need at most two from each layer. + +When $n$ is not a perfect power of two, not all levels are filled entirely — the last layer may be incomplete — but the truthfulness of these properties remains unaffected. The first property allows us to use only $O(n)$ memory to store the tree, and the last two let us solve the problem in $O(\log n)$ time: + +- The `add(k, x)` query can be handled by adding the value `x` to all nodes whose segments contain the element `k`, and we've already established that there are only $O(\log n)$ of them. +- The `sum(k)` query can be answered by finding all nodes that collectively compose the `[0, k)` prefix and summing the values stored in them — and we've also established that there would be at most $O(\log n)$ of them. + +But this is still theory. As we'll see later, there are remarkably many ways one can implement this data structure. + + + +### Pointer-Based Implementation + +The most straightforward way to implement a segment tree is to store everything we need in a node explicitly: including the array segment boundaries, the sum, and the pointers to its children. + +If we were at the "Introduction to OOP" class, we would implement a segment tree recursively like this: + +```c++ +struct segtree { + int lb, rb; // the range this node is responsible for + int s = 0; // the sum of elements [lb, rb) + segtree *l = nullptr, *r = nullptr; // pointers to its children + + segtree(int lb, int rb) : lb(lb), rb(rb) { + if (lb + 1 < rb) { // if the node is not a leaf, create children + int m = (lb + rb) / 2; + l = new segtree(lb, m); + r = new segtree(m, rb); + } + } + + void add(int k, int x) { /* react to a[k] += x */ } + int sum(int k) { /* compute the sum of the first k elements */ } +}; +``` + +If we needed to build it over an existing array, we would rewrite the body of the constructor like this: + +```c++ +if (lb + 1 == rb) { + s = a[lb]; // the node is a leaf -- its sum is just the element a[lb] +} else { + int t = (lb + rb) / 2; + l = new segtree(lb, t); + r = new segtree(t, rb); + s = l->s + r->s; // we can use the sums of children that we've just calculated +} +``` + +The construction time is of no significant interest to us, so to reduce the mental burden, we will just assume that the array is zero-initialized in all future implementations. 
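Once `add` and `sum` are filled in (we do that next), using this class could look like the following hypothetical snippet:

```c++
segtree root(0, n);  // a zero-initialized tree over the range [0, n)
root.add(4, 42);     // a[4] += 42
int s = root.sum(5); // the sum of the first five elements, a[0] + ... + a[4]
```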
+ +Now, to implement `add`, we need to descend down the tree until we reach a leaf node, adding the delta to the `s` fields: + +```c++ +void add(int k, int x) { + s += x; + if (l != nullptr) { // check whether it is a leaf node + if (k < l->rb) + l->add(k, x); + else + r->add(k, x); + } +} +``` + + + +To calculate the sum on a segment, we can check if the query covers the current segment fully or doesn't intersect with it at all — and return the result for this node right away. If neither is the case, we recursively pass the query to the children so that they figure it out themselves: + +```c++ +int sum(int lq, int rq) { + if (rb <= lq && rb <= rq) // if we're fully inside the query, return the sum + return s; + if (rq <= lb || lq >= rb) // if we don't intersect with the query, return zero + return 0; + return l->sum(lq, rq) + r->sum(lq, rq); +} +``` + +This function visits a total of $O(\log n)$ nodes because it only spawns children when a segment only partially intersects with the query, and there are at most $O(\log n)$ of such segments. + +For *prefix sums*, these checks can be simplified as the left border of the query is always zero: + +```c++ +int sum(int k) { + if (rb <= k) + return s; + if (lb >= k) + return 0; + return l->sum(k) + r->sum(k); +} +``` + +Since we have two types of queries, we also got two graphs to look at: + +![](../img/segtree-pointers.svg) + +While this object-oriented implementation is quite good in terms of software engineering practices, there are several aspects that make it terrible in terms of performance: + +- Both query implementations use [recursion](/hpc/architecture/functions) — although the `add` query can be tail-call optimized. +- Both query implementations use unpredictable [branching](/hpc/pipelining/branching), which stalls the CPU pipeline. +- The nodes store extra metadata. The structure takes $4+4+4+8+8=28$ bytes and gets padded to 32 bytes for [memory alignment](/hpc/cpu-cache/alignment) reasons, while only 4 bytes are really necessary to hold the integer sum. +- Most importantly, we are doing a lot of [pointer chasing](/hpc/cpu-cache/latency): we have to fetch the pointers to the children to descend into them, even though we can infer, ahead of time, which segments we'll need just from the query. + +Pointer chasing outweighs all other issues by orders of magnitude — and to negate it, we need to get rid of pointers, making the structure *implicit*. + +### Implicit Segment Trees + +As a segment tree is a type of binary tree, we can use the [Eytzinger layout](../binary-search#eytzinger-layout) to store its nodes in one large array and use index arithmetic instead of explicit pointers to navigate it. + +More formally, we define node $1$ to be the root, holding the sum of the entire array $[0, n)$. Then, for every node $v$ corresponding to the range $[l, r]$, we define: + +- the node $2v$ to be its left child corresponding to the range $[l, \lfloor \frac{l+r}{2} \rfloor)$; +- the node $(2v+1)$ to be its right child corresponding to the range $[\lfloor \frac{l+r}{2} \rfloor, r)$. + +When $n$ is a perfect power of two, this layout packs the entire tree very nicely: + +![The memory layout of the implicit segment tree with the same query path highlighted](../img/segtree-layout.png) + +However, when $n$ is not a power of two, the layout stops being compact: although we still have exactly $(2n - 1)$ nodes regardless of how we split segments, they are no longer mapped perfectly to the $[1, 2n)$ range. 
For example, consider what happens when we descend to the rightmost leaf in a segment tree of size $17 = 2^4 + 1$:

- we start with the root numbered $1$ representing the range $[0, 16]$,
- we go to node $3 = 2 \times 1 + 1$ representing the range $[8, 16]$,
- we go to node $7 = 2 \times 3 + 1$ representing the range $[12, 16]$,
- we go to node $15 = 2 \times 7 + 1$ representing the range $[14, 16]$,
- we go to node $31 = 2 \times 15 + 1$ representing the range $[15, 16]$,
- and we finally reach node $63 = 2 \times 31 + 1$ representing the range $[16, 16]$.

So, as $63 > 2 \times 17 - 1 = 33$, there are some empty spaces in the layout, but the structure of the tree is still the same, and its height is still $O(\log n)$. For now, we can ignore this problem and just allocate a larger array for storing the nodes — it can be shown that the index of the rightmost leaf never exceeds $4n$, so allocating that many cells will always suffice:

```c++
int t[4 * N]; // contains the node sums
```

Now, to implement `add`, we create a similar recursive function, but using index arithmetic instead of pointers. Since we've also stopped storing the borders of the segment in the nodes, we need to re-calculate them and pass them as parameters for each recursive call:

```c++
void add(int k, int x, int v = 1, int l = 0, int r = N) {
    t[v] += x;
    if (l + 1 < r) {
        int m = (l + r) / 2;
        if (k < m)
            add(k, x, 2 * v, l, m);
        else
            add(k, x, 2 * v + 1, m, r);
    }
}
```

The implementation of the prefix sum query is largely the same:

```c++
int sum(int k, int v = 1, int l = 0, int r = N) {
    if (l >= k)
        return 0;
    if (r <= k)
        return t[v];
    int m = (l + r) / 2;
    return sum(k, 2 * v, l, m)
         + sum(k, 2 * v + 1, m, r);
}
```

Passing around five variables in a recursive function seems clumsy, but the performance gains are clearly worth it:

![](../img/segtree-topdown.svg)

Apart from requiring much less memory, which is good for fitting into the CPU caches, the main advantage of this implementation is that we can now make use of [memory parallelism](/hpc/cpu-cache/mlp) and fetch the nodes we need in parallel, considerably improving the running time for both queries.

To improve the performance further, we can:

- manually optimize the index arithmetic (e.g., noticing that we need to multiply `v` by `2` either way),
- replace division by two with an explicit binary shift (because [compilers aren't always able to do it themselves](/hpc/compilation/contracts/#arithmetic)),
- and, most importantly, get rid of [recursion](/hpc/architecture/functions) and make the implementation fully iterative.

As `add` is tail-recursive and has no return value, it is easy to turn it into a single `while` loop:

```c++
void add(int k, int x) {
    int v = 1, l = 0, r = N;
    while (l + 1 < r) {
        t[v] += x;
        v <<= 1;
        int m = (l + r) >> 1;
        if (k < m)
            r = m;
        else
            l = m, v++;
    }
    t[v] += x;
}
```

Doing the same for the `sum` query is slightly harder as it has two recursive calls.
The key trick is to notice that when we make these calls, one of them is guaranteed to terminate immediately as `k` can only be in one of the halves, so we can simply check this condition before descending the tree: + +```c++ +int sum(int k) { + int v = 1, l = 0, r = N, s = 0; + while (true) { + int m = (l + r) >> 1; + v <<= 1; + if (k >= m) { + s += t[v++]; + if (k == m) + break; + l = m; + } else { + r = m; + } + } + return s; +} +``` + +This doesn't improve the performance for the update query by a lot (because it was tail-recursive, and the compiler already performed a similar optimization), but the running time on the prefix sum query has roughly halved for all problem sizes: + +![](../img/segtree-iterative.svg) + +This implementation still has some problems: we are using up to twice as much memory as necessary, we have costly [branching](/hpc/pipelining/branching), and we have to maintain and re-compute array bounds on each iteration. To get rid of these problems, we need to change our approach a little bit. + +### Bottom-Up Implementation + +Let's change the definition of the implicit segment tree layout. Instead of relying on the parent-to-child relationship, we first forcefully assign all the leaf nodes numbers in the $[n, 2n)$ range, and then recursively define the parent of node $k$ to be equal to node $\lfloor \frac{k}{2} \rfloor$. + +This structure is largely the same as before: you can still reach the root (node $1$) by dividing any node number by two, and each node still has at most two children: $2k$ and $(2k + 1)$, as anything else yields a different parent number when floor-divided by two. The advantage we get is that we've forced the last layer to be contiguous and start from $n$, so we can use the array of half the size: + +```c++ +int t[2 * N]; +``` + +When $n$ is a power of two, the structure of the tree is exactly the same as before and when implementing the queries, we can take advantage of this bottom-up approach and start from the $k$-th leaf node (simply indexed $N + k$) and ascend the tree until we reach the root: + +```c++ +void add(int k, int x) { + k += N; + while (k != 0) { + t[k] += x; + k >>= 1; + } +} +``` + +To calculate the sum on the $[l, r)$ subsegment, we can maintain pointers to the first and the last element that needs to be added, increase/decrease them respectively when we add a node and stop after they converge to the same node (which would be their least common ancestor): + +```c++ +int sum(int l, int r) { + l += N; + r += N - 1; + int s = 0; + while (l <= r) { + if ( l & 1) s += t[l++]; // l is a right child: add it and move to a cousin + if (~r & 1) s += t[r--]; // r is a left child: add it and move to a cousin + l >>= 1, r >>= 1; + } + return s; +} +``` + +Surprisingly, both queries work correctly even when $n$ is not a power of two. To understand why, consider a 13-element segment tree: + +![](../img/segtree-permuted.png) + +The first index of the last layer is always a power of two, but when the array size is not a perfect power of two, some prefix of the leaf elements gets wrapped around to the right side of the tree. Magically, this fact does not pose a problem for our implementation: + +- The `add` query still updates its parent nodes, even though some of them correspond to some prefix and some suffix of the array instead of a contiguous subsegment. 
- The `sum` query still computes the sum on the correct subsegment, even when `l` is on that wrapped prefix and logically "to the right" of `r`, because eventually `l` becomes the last node on a layer and gets incremented, suddenly jumping to the first element of the next layer and proceeding normally after adding just the right nodes on the wrapped-around part of the tree (look at the dimmed nodes in the illustration).

Compared to the top-down approach, we use half the memory and don't have to maintain query ranges, which results in simpler and consequently faster code:

![](../img/segtree-bottomup.svg)

When running the benchmarks, we use the `sum(l, r)` procedure for computing a general subsegment sum and just fix `l` equal to `0`. To achieve higher performance on the prefix sum query, we want to avoid maintaining `l` and only move the right border like this:

```c++
int sum(int k) {
    int s = 0;
    k += N - 1;
    while (k != 0) {
        if (~k & 1) // if k is a left child
            s += t[k--];
        k = k >> 1;
    }
    return s;
}
```

In contrast, this prefix sum implementation doesn't work when $n$ is not a power of two — because `k` could be on that wrapped-around part, and we'd sum almost the entire array instead of a small prefix.

To make it work for arbitrary array sizes, we can permute the leaves so that they are in the left-to-right logical order in the last two layers of the tree. In the example above, this would mean adding $3$ to all leaf indexes and then moving the last three leaves one level higher by subtracting $13$.

In the general case, this can be done using predication in a few cycles like this:

```c++
const int last_layer = 1 << __lg(2 * N - 1);

// calculate the index of the leaf k
int leaf(int k) {
    k += last_layer;
    k -= (k >= 2 * N) * N;
    return k;
}
```

When implementing the queries, all we need to do is to call the `leaf` function to get the correct leaf index:

```c++
void add(int k, int x) {
    k = leaf(k);
    while (k != 0) {
        t[k] += x;
        k >>= 1;
    }
}

int sum(int k) {
    k = leaf(k - 1);
    int s = 0;
    while (k != 0) {
        if (~k & 1)
            s += t[k--];
        k >>= 1;
    }
    return s;
}
```

The last touch: by replacing the `s += t[k--]` line with [predication](/hpc/pipelining/branchless), we can make the implementation branchless (except for the last branch — we still need to check the loop condition):

```c++
int sum(int k) {
    k = leaf(k - 1);
    int s = 0;
    while (k != 0) {
        s += (~k & 1) ? t[k] : 0; // will be replaced with a cmov
        k = (k - 1) >> 1;
    }
    return s;
}
```

When combined, these optimizations make the prefix sum queries run much faster:

![](../img/segtree-branchless.svg)

Notice that the bump in the latency for the prefix sum query starts at $2^{19}$ and not at $2^{20}$, the L3 cache boundary. This is because we are still storing $2n$ integers and also fetching the `t[k]` element regardless of whether we will add it to `s` or not. We can actually solve both of these problems.

### Fenwick Trees

Implicit structures are great: they avoid pointer chasing, allow visiting all the relevant nodes in parallel, and take less space as they don't store metadata in nodes. Even better than implicit structures are *succinct* structures: they only require the information-theoretical minimum space to store the structure, using only $O(1)$ additional memory.
To make a segment tree succinct, we need to look at the values stored in the nodes and search for redundancies — the values that can be inferred from others — and remove them. One way to do this is to notice that in every implementation of prefix sum, we've never used the sums stored in right children — therefore, for computing prefix sums, such nodes are redundant:

![](../img/segtree-succinct.png)

*The Fenwick tree* (also called *binary indexed tree* — soon you'll understand why) is a type of segment tree that uses this consideration and gets rid of all *right* children, essentially removing every second node in each layer and making the total node count the same as the size of the underlying array.

```c++
int t[N + 1]; // +1 because we use one-based indexing
```

To store these segment sums compactly, the Fenwick tree ditches the Eytzinger layout: instead, in place of every element $k$ that would be a leaf in the last layer of a segment tree, it stores the sum of its first non-removed ancestor. For example:

- the element $7$ would hold the sum on the $[0, 7]$ range ($282$),
- the element $9$ would hold the sum on the $[8, 9]$ range ($-86$),
- the element $10$ would hold the sum on the $[10, 10]$ range ($-52$, the element itself).

How can we compute this range for a given element $k$ (the left boundary, to be more specific: the right boundary is always the element $k$ itself) quicker than by simulating the descent down the tree? It turns out that there is a smart bit trick that works when the tree size is a power of two and we use one-based indexing — just remove the least significant set bit of the index:

- the left bound for element $7 + 1 = 8 = 1000_2$ is $0000_2 = 0$,
- the left bound for element $9 + 1 = 10 = 1010_2$ is $1000_2 = 8$,
- the left bound for element $10 + 1 = 11 = 1011_2$ is $1010_2 = 10$.

And to get the lowest set bit of an integer, we can use this procedure:

```c++
int lowbit(int x) {
    return x & -x;
}
```

This trick works by virtue of how signed numbers are stored in binary using [two's complement](/hpc/arithmetic/integer). When we compute `-x`, we implicitly subtract it from a large power of two: some prefix of the number flips, some suffix of zeros at the end remains, and the only one-bit that stays unchanged is the lowest set bit — which will be the only one surviving `x & -x`. For example:

```
+22 = 16 + 4 + 2 = (0)10110
-22 = 00000 - 10110 = (1)01010
    → (+22) & (-22) = (0)00010
```

We've established that a Fenwick tree is just an array of size `n` where each element `k` is defined to be the sum of the elements from `k - lowbit(k) + 1` to `k` inclusive in the original array, and now it's time to implement some queries.

Implementing the prefix sum query is easy.
The `t[k]` holds the sum we need except for the first `k - lowbit(k)` elements, so we can just add it to the result and then jump to `k - lowbit(k)` and continue doing this until we reach the beginning of the array: + +```c++ +int sum(int k) { + int s = 0; + for (; k != 0; k -= lowbit(k)) + s += t[k]; + return s; +} +``` + + + +Since we are repeatedly removing the lowest set bit from `k`, and also since this procedure is equivalent to visiting the same left-child nodes in a segment tree, each `sum` query can touch at most $O(\log n)$ nodes: + +![A path for a prefix sum query in a Fenwick tree](../img/fenwick-sum.png) + +To slightly improve the performance of the `sum` query, we use `k &= k - 1` to remove the lowest bit in one go, which is one instruction faster than `k -= k & -k`: + +```c++ +int sum(int k) { + int s = 0; + for (; k != 0; k &= k - 1) + s += t[k]; + return s; +} +``` + +Unlike all previous segment tree implementations, a Fenwick tree is a structure where it is easier and more efficient to calculate the sum on a subsegment as the difference of two prefix sums: + +```c++ +// [l, r) +int sum (int l, int r) { + return sum(r) - sum(l); +} +``` + +The update query is easier to code but less intuitive. We need to add a value `x` to all nodes that are left-child ancestors of leaf `k`. Such nodes have indices `m` larger than `k` but `m - lowbit(m) < k` so that `k` is included in their ranges. + +All such indices need to have a common prefix with `k`, then a `1` where it was `0` in `k`, and then a suffix of zeros so that that `1` canceled and the result of `m - lowbit(m)` is less than `k`. All such indices can be generated iteratively like this: + +```c++ +void add(int k, int x) { + for (k += 1; k <= N; k += k & -k) + t[k] += x; +} +``` + +Repeatedly adding the lowest set bit to `k` makes it "more even" and lifts it to its next left-child segment tree ancestor: + +![A path for an update query in a Fenwick tree](../img/fenwick-update.png) + +Now, if we leave all the code as it is, it works correctly even when $n$ is not a power of two. In this case, the Fenwick tree is not equivalent to a segment tree of size $n$ but to a *forest* of up to $O(\log n)$ segment trees of power-of-two sizes — or to a single segment tree padded with zeros to a large power of two, if you like to think this way. In either case, all procedures still work correctly as they never touch anything outside the $[1, n]$ range. + + + +The performance of the Fenwick tree is similar to the optimized bottom-up segment tree for the update queries and slightly faster for the prefix sum queries: + +![](../img/segtree-fenwick.svg) + +There is one weird thing on the graph. After we cross the L3 cache boundary, the performance takes off very rapidly. This is a [cache associativity](/hpc/cpu-cache/associativity) effect: the most frequently used cells all have their indices divisible by large powers of two, so they get aliased to the same cache set, kicking each other out and effectively reducing the cache size. 
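To make the aliasing more concrete (assuming a typical L1 cache with 64-byte lines and 64 sets, i.e., a 4096-byte stride between addresses that fall into the same set): the hottest cells are the ones whose indices are divisible by large powers of two, and any two of `t[1 << 10]`, `t[1 << 11]`, `t[1 << 12]`, … are a multiple of $2^{10} \cdot 4 = 4096$ bytes apart, so they all compete for the same cache set.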
+
+One way to negate this effect is to insert "holes" in the layout like this:
+
+```c++
+inline constexpr int hole(int k) {
+    return k + (k >> 10);
+}
+
+int t[hole(N) + 1];
+
+void add(int k, int x) {
+    for (k += 1; k <= N; k += k & -k)
+        t[hole(k)] += x;
+}
+
+int sum(int k) {
+    int res = 0;
+    for (; k != 0; k &= k - 1)
+        res += t[hole(k)];
+    return res;
+}
+```
+
+Computing the `hole` function is not on the critical path between iterations, so it does not introduce any significant overhead but completely removes the cache associativity problem and shrinks the latency by up to 3x on large arrays:
+
+![](../img/segtree-fenwick-holes.svg)
+
+Fenwick trees are fast, but there are still other minor issues with them. Similar to [binary search](../binary-search), the temporal locality of their memory accesses is not the greatest, as rarely accessed elements are grouped with the most frequently accessed ones. Fenwick trees also execute a non-constant number of iterations and have to perform end-of-loop checks, very likely causing a branch misprediction — although just a single one.
+
+There are probably still some things to optimize, but we are going to leave it there and focus on an entirely different approach, and if you know [S-trees](../s-tree), you probably already know where this is headed.
+
+### Wide Segment Trees
+
+Here is the main idea: if the memory system is fetching a full [cache line](/hpc/cpu-cache/cache-lines) for us anyway, let's fill it to the maximum with information that lets us process the query quicker. For segment trees, this means storing more than one data point in a node. This lets us reduce the tree height and perform fewer iterations when descending or ascending it:
+
+![](../img/segtree-wide.png)
+
+We will use the term *wide (B-ary) segment tree* to refer to this modification.
+
+To implement this layout, we can use a [constexpr](/hpc/compilation/precalc)-based approach similar to the one we used in [S+ trees](../s-tree#implicit-b-tree-1):
+
+```c++
+const int b = 4, B = (1 << b); // cache line size (in integers, not bytes)
+
+// the height of the tree over an n-element array
+constexpr int height(int n) {
+    return (n <= B ? 1 : height(n / B) + 1);
+}
+
+// where the h-th layer starts
+constexpr int offset(int h) {
+    int s = 0, n = N;
+    while (h--) {
+        n = (n + B - 1) / B;
+        s += n * B;
+    }
+    return s;
+}
+
+constexpr int H = height(N);
+alignas(64) int t[offset(H)]; // an array for storing nodes
+```
+
+This way, we effectively reduce the height of the tree by approximately $\frac{\log_2 n}{\log_B n} = \log_2 B$ times ($\sim4$ times if $B = 16$), but it becomes non-trivial to implement in-node operations efficiently. For our problem, we have two main options:
+
+1. We could store $B$ *sums* in each node (for each of its $B$ children).
+2. We could store $B$ *prefix sums* in each node (the $i$-th being the sum of the first $(i + 1)$ children).
+
+If we go with the first option, the `add` query would be largely the same as in the bottom-up segment tree, but the `sum` query would need to add up to $B$ scalars in each node it visits. And if we go with the second option, the `sum` query would be trivial, but the `add` query would need to add `x` to some suffix on each node it visits.
+
+In either case, one query would perform $O(\log_B n)$ operations, touching just one scalar in each node, while the other would perform $O(B \cdot \log_B n)$ operations, touching up to $B$ scalars in each node.
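+
+To make this trade-off more tangible, here is a rough scalar sketch of the first option (not the implementation we are going to use). It assumes that `t` now stores plain per-child sums and reuses the `b`, `B`, `H`, and `offset` definitions from above:
+
+```c++
+// option 1 (sketch): t[offset(h) + i] holds the plain sum of the segment
+// covered by position i on level h (on level 0, it is just the element itself)
+void add(int k, int x) {
+    for (int h = 0; h < H; h++)
+        t[offset(h) + (k >> (h * b))] += x; // one scalar per level
+}
+
+int sum(int k) {
+    int s = t[k]; // the element itself (offset(0) == 0)
+    for (int h = 0; h < H; h++) {
+        int i = k >> (h * b);
+        for (int j = i & ~(B - 1); j < i; j++) // preceding siblings within the node
+            s += t[offset(h) + j];
+    }
+    return s;
+}
+```
+
+Here all the extra scalar work falls on the `sum` query; with prefix sums, it would be the `add` query instead.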
We can, however, use [SIMD](/hpc/simd) to accelerate the slower operation. Since there are no fast [horizontal reductions](/hpc/simd/reduction) in SIMD instruction sets, but adding one vector to another is easy, we will choose the second approach and store prefix sums in each node.
+
+This makes the `sum` query extremely fast and easy to implement:
+
+```c++
+int sum(int k) {
+    int s = 0;
+    for (int h = 0; h < H; h++)
+        s += t[offset(h) + (k >> (h * b))];
+    return s;
+}
+```
+
+The `add` query is more complicated and slower. We need to add a number only to a suffix of a node, and we can do this by [masking out](/hpc/simd/masking) the positions that should not be modified.
+
+We can pre-calculate a $B \times B$ array corresponding to $B$ such masks that tell, for each of $B$ positions within a node, whether a certain prefix sum value needs to be updated or not:
+
+```c++
+struct Precalc {
+    alignas(64) int mask[B][B];
+
+    constexpr Precalc() : mask{} {
+        for (int k = 0; k < B; k++)
+            for (int i = 0; i < B; i++)
+                mask[k][i] = (i > k ? -1 : 0);
+    }
+};
+
+constexpr Precalc T;
+```
+
+Apart from this masking trick, the rest of the computation is simple enough to be handled with [GCC vector types](/hpc/simd/intrinsics#gcc-vector-extensions) only. When processing the `add` query, we just bitwise-and the broadcast `x` value with the appropriate mask and add the result to the values stored in the node:
+
+```c++
+typedef int vec __attribute__ (( vector_size(32) ));
+
+constexpr int round(int k) {
+    return k & ~(B - 1); // = k / B * B
+}
+
+void add(int k, int x) {
+    vec v = x + vec{};
+    for (int h = 0; h < H; h++) {
+        auto a = (vec*) &t[offset(h) + round(k)];
+        auto m = (vec*) T.mask[k % B];
+        for (int i = 0; i < B / 8; i++)
+            a[i] += v & m[i];
+        k >>= b;
+    }
+}
+```
+
+This speeds up the `sum` query by more than 10x and the `add` query by up to 4x compared to the Fenwick tree:
+
+![](../img/segtree-simd.svg)
+
+Unlike [S-trees](../s-tree), the block size can be easily changed in this implementation (by literally changing one character). As expected, when we increase it, the update time also increases as we need to fetch more cache lines and process them, but the `sum` query time decreases as the height of the tree becomes smaller:
+
+![](../img/segtree-simd-others.svg)
+
+Similar to the [S+ trees](../s-tree/#modifications-and-further-optimizations), the optimal memory layout probably has non-uniform block sizes, depending on the problem size and the distribution of queries, but we are not going to explore this idea and just leave the optimization here.
+
+### Comparisons
+
+Wide segment trees are significantly faster compared to other popular segment tree implementations:
+
+![](../img/segtree-popular.svg)
+
+The relative speedup spans orders of magnitude:
+
+![](../img/segtree-popular-relative.svg)
+
+Compared to the original pointer-based implementation, the wide segment tree is up to 200 and 40 times faster for the prefix sum and update queries, respectively — although, for sufficiently large arrays, both implementations become purely memory-bound, and this speedup goes down to around 60 and 15 respectively.
+
+### Modifications
+
+We have only focused on the prefix sum problem for 32-bit integers — to make this already long article slightly less long and also to make the comparison with the Fenwick tree fair — but wide segment trees can be used for other common range operations, although implementing them efficiently with SIMD requires some creativity.
+ +*Disclaimer:* I haven't implemented any of these ideas, so some of them may be fatally flawed. + +**Other data types** can be trivially supported by changing the vector type and, if they differ in size, the node size $B$ — which also changes the tree height and hence the total number of iterations for both queries. + +It may also be that the queries have different limits on the updates and the prefix sum queries. For example, it is not uncommon to have only "$\pm 1$" update queries with a guarantee that the result of the prefix sum query always fits into a 32-bit integer. If the result could fit into 8 bits, we'd simply use a 8-bit `char` with block size of $B=64$ bytes, making the total tree height $\frac{\log_{16} n}{\log_{64} n} = \log_{16} 64 = 1.5$ times smaller and both queries proportionally faster. + +Unfortunately, that doesn't work in the general case, but we still have a way to speed up queries when the update deltas are small: we can *buffer* the updates queries. Using the same "$\pm 1$" example, we can make the branching factor $B=64$ as we wanted, and in each node, we store $B$ 32-bit integers, $B$ 8-bit signed chars, and a single 8-bit counter variable that starts at $127$ and decrements each time we update a node. Then, when we process the queries in nodes: + +- For the update query, we add a vector of masked 8-bit plus-or-minus ones to the `char` array, decrement the counter, and, if it is zero, [convert](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=3037,3009,4870,6715,4845,3853,288,6570,90,7307,5993,2692,6946,6949,5456,6938,5456,1021,3007,514,518,7253,7183,3892,5135,5260,3915,4027,3873,7401,4376,4229,151,2324,2310,2324,591,4075,3011,3009,6130,4875,6385,5259,6385,6250,1395,7253,6452,7492,4669,4669,7253,1039,1029,4669,4707,7253,7242,848,879,848,7251,4275,879,874,849,833,6046,7250,4870,4872,4875,849,849,5144,4875,4787,4787,4787,3016,3018,5227,7359,7335,7392,4787,5259,5230,5230,5223,5214,6438,5229,488,483,6527,6527,6554,1829,1829,1829&techs=AVX,AVX2&text=cvtepi8_) the values in the `char` array to 32-bit integers, add them to the integer array, set the `char` array to zero, and reset the counter back to 127. +- For the prefix sum query, we visit the same nodes but add *both* `int` and `char` values to the result. + +This update accumulation trick lets us increase the performance by up to 1.5x at the cost of using ~25% more memory. + +Having a conditional branch in the `add` query and adding the `char` array to the `int` array is rather slow, but since we only have to do it every 127 iterations, it doesn't cost us anything in the amortized sense. The processing time for the `sum` query increases, but not significantly — because it mostly depends on the slowest read rather than the number of iterations. + +**General range queries** can be supported the same way as in the Fenwick tree: just decompose the range $[l, r)$ as the difference of two prefix sums $[0, r)$ and $[0, l)$. + +This also works for some operations other than addition (multiplication modulo prime, xor, etc.), although they have to be *reversible:* there should be a way to quickly "cancel" the operation on the left prefix from the final result. + +**Non-reversible operations** can also be supported, although they should still satisfy some other properties: + +- They must be *associative:* $(a \circ b) \circ c = a \circ (b \circ c)$. +- They must have an *identity element:* $a \circ e = e \circ a = a$. 
+
+(Such algebraic structures are called [monoids](https://en.wikipedia.org/wiki/Monoid) if you're a snob.)
+
+Unfortunately, the prefix sum trick doesn't work when the operation is not reversible, so we have to switch to [option one](#wide-segment-trees) and store the results of these operations separately for each segment. This requires some significant changes to the queries:
+
+- The update query should replace one scalar at the leaf, perform a [horizontal reduction](/hpc/simd/reduction/#horizontal-summation) at the leaf node, and then continue upwards, replacing one scalar of its parent and so on.
+- The range reduction query should, separately for left and right borders, calculate a vector with vertically reduced values on their paths, combine these two vectors into one, and then reduce it horizontally to return the final answer. Note that we still need to use masking to replace values outside of the query range with neutral elements, and this time, it probably requires some conditional moves/blending and either $B \times B$ precomputed masks or two masks to account for both left and right borders of the query.
+
+This makes both queries much slower — especially the reduction — but this should still be faster than the bottom-up segment tree.
+
+**Minimum** is a nice exception where the update query can be made slightly faster if the new value of the element is less than the current one: we can skip the horizontal reduction part and just update $\log_B n$ nodes using a scalar procedure.
+
+This works very fast when we mostly have such updates, which is the case, e.g., for the sparse-graph Dijkstra algorithm when we have more edges than vertices. For this problem, the wide segment tree can serve as an efficient fixed-universe min-heap.
+
+**Lazy propagation** can be done by storing a separate array for the delayed operations in a node. To propagate the updates, we need to go top to bottom (which can be done by simply reversing the direction of the `for` loop and using `k >> (h * b)` to calculate the `h`-th ancestor), [broadcast](/hpc/simd/moving/#broadcast) and reset the delayed operation value stored in the parent of the current node, and apply it to all values stored in the current node with SIMD.
+
+One minor problem is that for some operations, we need to know the lengths of the segments: for example, when we need to support a sum and a mass assignment. It can be solved by padding the elements so that each segment on a layer is uniform in size, by pre-calculating the segment lengths and storing them in the node, or by using predication to check for the problematic nodes (there will be at most one on each layer).
+
+### Acknowledgements
+
+Many thanks to Giulio Ermanno Pibiri for collaborating on this case study, which is largely based on his 2020 paper "[Practical Trade-Offs for the Prefix-Sum Problem](https://arxiv.org/pdf/2006.14552.pdf)" co-authored with Rossano Venturini. I highly recommend reading the original article if you are interested in the details we've skipped over here for brevity.
+
+The code and some ideas regarding bottom-up segment trees were adapted from a 2015 blog post "[Efficient and easy segment trees](https://codeforces.com/blog/entry/18051)" by Oleksandr Bacherikov.
diff --git a/content/english/hpc/data-structures/segment.md b/content/english/hpc/data-structures/segment.md deleted file mode 100644 index 92c2afa0..00000000 --- a/content/english/hpc/data-structures/segment.md +++ /dev/null @@ -1,214 +0,0 @@ ---- -title: Segment Trees -weight: 2 -draft: true ---- - -The lessons we learned from studying layouts for binary search can be applied to broader range of data structures. - -Most of examples in this section are about optimizing some algorithms that are either included in standard library or take under 10 lines of code to implement naively, but we will start off with a bit more obscure example. - -Segment tree is a data structure that stores information about array segments. It is a static tree of degree two, and here is what this means: - -Segment trees are used for windowing queries or range queries in general, either by themselves or as part of a larger algorithm. They are very rarely mentioned in scientific literature, because they are relatively novel (invented around 2000), and *asymptotically* they don't do anything that any other binary tree can't, but they are dominant structure in the world of competitive programming because of their performance and ease of implementation. - -Segment trees are built recursively: build a tree for left and right halves and merge results to get root. - -```cpp -void add(int k, int x); // 0-based indexation -int sum(int k); // sum of elements indexed [0, k] -``` - -## Segment Trees - -* Static tree data structure used for storing information about array segments -* Popular in competitive programming, very rarely used in real life -* -* Many different implementations possible - -![](https://i.stack.imgur.com/xeIcl.png) - ----- - -### Pointer-Based - -* Actually really good in terms of SWE practices, but terrible in terms of performance -* Pointer chasing, 4 unnecessary metadata fields, recursion, branching - -```cpp -struct segtree { - int lb, rb; - int s = 0; - segtree *l = 0, *r = 0; - - segtree(int lb, int rb) : lb(lb), rb(rb) { - if (lb + 1 < rb) { - int t = (lb + rb) / 2; - l = new segtree(lb, t); - r = new segtree(t, rb); - } - } - - void add(int k, int x) { - s += x; - if (l) { - if (k < l->rb) - l->add(k, x); - else - r->add(k, x); - } - } - - int sum(int k) { // [0, k) - if (rb <= k) - return s; - if (lb >= k) - return 0; - return l->sum(k) + r->sum(k); - } -}; -``` - ----- - -### Implicit (Recursive) - -* Eytzinger-like layout: $2k$ is the left child and $2k+1$ is the right child -* Wasted memory, recursion, branching - -```cpp -int t[4 * N]; - -void _add(int k, int x, int v = 1, int l = 0, int r = N) { - t[v] += x; - if (l + 1 < r) { - int m = (l + r) / 2; - if (k < m) - _add(k, x, 2 * v, l, m); - else - _add(k, x, 2 * v + 1, m, r); - } -} - -int _sum(int k, int v = 1, int l = 0, int r = N) { - if (l > k) - return 0; - if (r - 1 <= k) - return t[v]; - int m = (l + r) / 2; - return _sum(k, 2 * v, l, m) - + _sum(k, 2 * v + 1, m, r); -} -``` - -### Implicit (Iterative) - -```cpp -void add(int k, int x) { - int v = 1, l = 0, r = N; - while (l + 1 < r) { - t[v] += x; - int m = (l + r) / 2; - if (k < m) - v = 2 * v, r = m; - else - v = 2 * v + 1, l = m; - } - t[v] += x; -} - -int sum(int k) { - if (k == N - 1) - return t[1]; - int v = 1, l = 0, r = n; - int s = 0; - while (l < r) { - int m = (l + r) / 2; - v *= 2; - if (k < m) { - if (k == m - 1) - return s + t[v]; - r = m; - } else { - s += t[v]; - v++; - l = m; - } - } - return s; -} -``` -### Implicit (Bottom-up) - -* Different layout: leaf nodes are 
numbered $n$ to $(2n - 1)$, "parent" is $\lfloor k/2 \rfloor$ -* Minimum possible amount of memory -* Fully iterative and no branching (pipelinize-able reads!) - -```cpp -int n, t[2*maxn]; - -void build() { - for (int i = n-1; i > 0; i--) - t[i] = max(t[i<<1], t[i<<1|1]); -} - -void upd(int k, int x) { - k += n; - t[k] = x; - while (k > 1) { - t[k>>1] = max(t[k], t[k^1]); - k >>= 1; - } -} - -int rmq(int l, int r) { - int ans = 0; - l += n, r += n; - while (l <= r) { - if (l&1) ans = max(ans, t[l++]); - if (!(r&1)) ans = max(ans, t[r--]); - l >>= 1, r >>= 1; - } - return ans; -} -``` - -https://codeforces.com/blog/entry/18051 - ---- - -## Fenwick trees - -* Structure used to calculate prefix sums and similar operations -* Defined as array $t_i = \sum_{k=f(i)}^i a_k$ where $f$ is any function for which $f(i) \leq i$ -* If $f$ is "remove last bit" (`x -= x & -x`), - then both query and update would only require updating $O(\log n)$ different $t$'s - -```cpp -int t[maxn]; - -// calculate sum on prefix: -int sum(int r) { - int res = 0; - for (; r > 0; r -= r & -r) - res += t[r]; - return res; -} - -// how you can use it to calculate sums on subsegments: -int sum (int l, int r) { - return sum(r) - sum(l-1); -} - -// updates necessary t's: -void add(int k, int x) { - for (; k <= n; k += k & -k) - t[k] += x; -} -``` - -Can't be more optimal because of pipelining and implicit prefetching - -## Further Reading - -This article is loosely based on "[Practical Trade-Offs for the Prefix-Sum Problem](https://arxiv.org/pdf/2006.14552.pdf)" by Giulio Ermanno Pibiri and Rossano Venturini. diff --git a/content/english/hpc/external-memory/_index.md b/content/english/hpc/external-memory/_index.md index 11fb6a4a..0af587b3 100644 --- a/content/english/hpc/external-memory/_index.md +++ b/content/english/hpc/external-memory/_index.md @@ -3,15 +3,9 @@ title: External Memory weight: 8 --- -If a CPU core has a frequency of 3 GHz, it roughly means that it is capable of executing up to $3 \cdot 10^9$ operations per second, depending on what constitutes an "operation". This is the baseline: on modern architectures, it can be increased by techniques such as SIMD and instruction-level parallelism up to $10^{11}$ operations per second, if the computation allows it. +How long does it take to add two numbers together? Being one of the most frequently used instructions, `add` by itself only takes one cycle to execute. So, if the data is already loaded into registers, it takes one just cycle. -But for many algorithms, the CPU is not the bottleneck. Before trying to optimize performance above that baseline, we need to learn not to drop below it, and the number one reason for this is memory. - -## A + B - -To illustrate this point, consider this: how long does it take to add two numbers together? The only correct answer to this question is "it depends" — mainly on where the operands are stored. - -Being one of the most frequently used instructions, `add` by itself only takes one cycle to execute. So if the data is already in registers, it takes one cycle. In general case (`*c = *a + *b`), it needs to fetch its operands from memory first: +But in the general case (`*c = *a + *b`), we need to fetch its operands from memory first: ```nasm mov eax, DWORD PTR [rsi] @@ -19,6 +13,36 @@ add eax, DWORD PTR [rdi] mov DWORD PTR [rdx], eax ``` -Typically, the data is stored in the main memory (RAM), and it will take around ~40ns, or about 100 cycles, to fetch it, and then another 100 cycles to write it back. 
If it was accessed recently, it is probably *cached* and will take less than that to fetch, depending on how long ago it was accessed — it could be ~20ns for the slowest layer of cache and under 1ns for the fastest. But it could also be that the data is stored on the hard drive, and in this case it will take around 5ms, or roughly $10^7$ cycles (!), to access it. + + +When you fetch anything from memory, there is always some latency before the data arrives. Moreover, the request doesn't go directly to its ultimate storage location, but it first goes through a complex system of address translation units and caching layers designed to both help in memory management and reduce latency. + +Therefore, the only correct answer to this question is "it depends" — primarily on where the operands are stored: + +- If the data is stored in the main memory (RAM), it will take around ~100ns, or about 200 cycles, to fetch it, and then another 200 cycles to write it back. +- If it was accessed recently, it is probably *cached* and will take less than that to fetch, depending on how long ago it was accessed — it could be ~50 cycles for the slowest layer of cache and around 4-5 cycles for the fastest. +- But it could also be stored on some type of *external memory* such as a hard drive, and in this case, it will take around 5ms, or roughly $10^7$ cycles (!) to access it. + +Such a high variance of memory performance is caused by the fact that memory hardware doesn't follow the same [laws of silicon scaling](/hpc/complexity/hardware) as CPU chips do. Memory is still improving through other means, but if 50 years ago memory timings were roughly on the same scale with the instruction latencies, nowadays they lag far behind. + +![](img/memory-vs-compute.png) + +To be less of a limiting factor, modern memory systems are becoming increasingly [hierarchical](hierarchy), where the higher layers trade off some of their capacity for reduced latency. As these characteristics may change in the orders of magnitude between the layers — especially in the case of external memory types — it became crucial for many memory-intensive algorithms to optimize their I/O operations before anything else. + +This prompted the creation of a new cost model, called the *external memory model*, whose only primitive operations are block reads and writes, and everything else has zero cost as long as it only involves data stored in a limited-sized local memory. It spawned an exciting new field of *external memory algorithms*, which we will study in this chapter. + + diff --git a/content/english/hpc/external-memory/hierarchy.md b/content/english/hpc/external-memory/hierarchy.md index 249457cc..26dfc144 100644 --- a/content/english/hpc/external-memory/hierarchy.md +++ b/content/english/hpc/external-memory/hierarchy.md @@ -3,9 +3,7 @@ title: Memory Hierarchy weight: 1 --- -## Memory Hierarchy - -Modern computer memory is hierarchical. It consists of multiple *cache layers* of varying speed and size, where *upper* levels typically store most frequently accessed data from *lower* levels to reduce latency. Each new level is usually an order of magnitude faster, but also smaller and/or more expensive. +Modern computer memory is highly hierarchical. It consists of multiple *cache layers* of varying speed and size, where *higher* levels typically store most frequently accessed data from *lower* levels to reduce latency: each next level is usually an order of magnitude faster, but also smaller and/or more expensive. 
![](../img/hierarchy.png) @@ -16,14 +14,14 @@ From this perspective, each type of memory has a few important characteristics: - *total size* $M$; - *block size* $B$; - *latency*, that is, how much time it takes to fetch one byte; -- *bandwidth*, which may be higher than just the block size times latency, meaning that IO operations can "overlap"; -- *cost* in the amortized sense, including the price for chip, its energy requirements, maintenance and so on. +- *bandwidth*, which may be higher than just the block size times latency, meaning that I/O operations can "overlap"; +- *cost* in the amortized sense, including the price for the chip, its energy requirements, maintenance, and so on. Here is an approximate comparison table for commodity hardware in 2021: | Type | $M$ | $B$ | Latency | Bandwidth | $/GB/mo[^pricing] | |:-----|:---------|-----|---------|-----------|:------------------| -| L1 | 10K | 64B | 0.5ns | 80G/s | - | +| L1 | 10K | 64B | 2ns | 80G/s | - | | L2 | 100K | 64B | 5ns | 40G/s | - | | L3 | 1M/core | 64B | 20ns | 20G/s | - | | RAM | GBs | 64B | 100ns | 10G/s | 1.5 | @@ -31,41 +29,45 @@ Here is an approximate comparison table for commodity hardware in 2021: | HDD | TBs | - | 10ms | 1G/s | 0.04 | | S3 | $\infty$ | - | 150ms | $\infty$ | 0.02[^S3] | -Of course, in reality there are many specifics about each type of memory, which we will now go through. +In reality, there are many specifics about each type of memory, which we will now go through. -[^pricing]: Pricing information is taken from Google Cloud Platform. -[^S3]: Cloud storage typically has multiple tiers, becoming progressively cheaper if you access the data less frequently. +[^pricing]: Pricing information is taken from the [Google Cloud Platform](https://cloud.google.com/products/calculator?skip_cache=true). +[^S3]: Cloud storage typically has [multiple tiers](https://aws.amazon.com/s3/storage-classes/), becoming progressively cheaper if you access the data less frequently. ### Volatile Memory -Everything up to the RAM level is called *volatile memory*, because it does not persist data in case of a power shortage and other disasters. It is fast, which is why it is used to store temporary data while the computer is powered. +Everything up to the RAM level is called *volatile memory* because it does not persist data in case of a power shortage and other disasters. It is fast, which is why it is used to store temporary data while the computer is powered. From fastest to slowest: -- **CPU registers**, which are the zero-time access data cells CPU uses to store all its intermediate values, can also be thought of as a memory type. There is only a very limited number of them (e. g. 16 "general purpose" ones), and in some cases you may want to use all of them for performance reasons. -- **CPU caches.** Modern CPUs have multiple layers of cache (L1, L2, often L3, and rarely even L4). The lowest layer is shared between cores and is usually scaled with the their number (e. g. a 10-core CPU should have around 10M of L3 cache). +- **CPU registers**, which are the zero-time access data cells CPU uses to store all its intermediate values, can also be thought of as a memory type. There is only a limited number of them (e.g., just 16 "general purpose" ones), and in some cases, you may want to use all of them for performance reasons. +- **CPU caches.** Modern CPUs have multiple layers of cache (L1, L2, often L3, and rarely even L4). 
The lowest layer is shared between cores and is usually scaled with their number (e.g., a 10-core CPU should have around 10M of L3 cache). - **Random access memory,** which is the first scalable type of memory: nowadays you can rent machines with half a terabyte of RAM on the public clouds. This is the one where most of your working data is supposed to be stored. The CPU cache system has an important concept of a *cache line*, which is the basic unit of data transfer between the CPU and the RAM. The size of a cache line is 64 bytes on most architectures, meaning that all main memory is divided into blocks of 64 bytes, and whenever you request (read or write) a single byte, you are also fetching all its 63 cache line neighbors whether your want them or not. -Caching on the CPU level happens automatically based on the last access times of cache lines. When accessed, the contents of a cache line are emplaced onto the lowest cache layer, and then gradually evicted to a higher levels unless accessed again in time. The programmer can't control this process explicitly, but it is worthwhile to study how it works in detail, which we will do [later](cpu-cache) in this chapter. +Caching on the CPU level happens automatically based on the last access times of cache lines. When accessed, the contents of a cache line are emplaced onto the lowest cache layer and then gradually evicted to higher levels unless accessed again in time. The programmer can't control this process explicitly, but it is worthwhile to study how it works in detail, which we will do [in the next chapter](/hpc/cpu-cache). + + + ### Non-Volatile Memory -While the data cells in CPU caches and the RAM only gently store just a few electrons (that periodically leak and need to be periodically refreshed), the data cells in *non-volatile memory* types store hundreds of them. This lets the data to be persisted for prolonged periods of time without power, but comes at the cost of performance and durability — because when you have more electrons, you also have more opportunities for them colliding with silicon atoms. +While the data cells in CPU caches and the RAM only gently store just a few electrons (that periodically leak and need to be periodically refreshed), the data cells in *non-volatile memory* types store hundreds of them. This lets the data persist for prolonged periods of time without power but comes at the cost of performance and durability — because when you have more electrons, you also have more opportunities for them to collide with silicon atoms. There are many ways to store data in a persistent way, but these are the main ones from a programmer's perspective: -- **Solid state drives.** These have relatively low latency on the order of 0.1ms ($10^5$ ns), but they also have high cost, amplified by the fact that they have limited lifespans as each cell can only be written to a limited number of times. This is what mobile devices and most laptops use, because they are compact and have no moving parts. +- **Solid state drives.** These have relatively low latency on the order of 0.1ms ($10^5$ ns), but they also have a high cost, amplified by the fact that they have limited lifespans as each cell can only be written to a limited number of times. This is what mobile devices and most laptops use because they are compact and have no moving parts. - **Hard disk drives** are unusual because they are actually [rotating physical disks](https://www.youtube.com/watch?v=3owqvmMf6No&feature=emb_title) with a read/write head attached to them. 
To read a memory location, you need to wait until the disk rotates to the right position and then very precisely move the head to it. This results in some very weird access patterns where reading one byte randomly may take the same time as reading the next 1MB of data — which is usually on the order of milliseconds. Since this is the only part of a computer, except for the cooling system, that has mechanically moving parts, hard disks break quite often (with the average lifespan of ~3 years for a data center HDD). -- **Network-attached storage**, which is the practice of using other networked devices to store data on them. There are two distinctive types. The first one is the Network File System (NFS), which is a protocol for mounting other computer's file system over the network. The other is API-based distributed storage systems, most famously [Amazon S3](https://aws.amazon.com/s3/), that are backed by a fleet of storage-optimized machines of a public cloud, typically using cheap HDDs or some [more exotic](https://aws.amazon.com/storagegateway/vtl/) storage types internally. While NFS can can sometimes work even faster than HDD if located in the same data center, object storage in the public cloud usually has latencies of 50-100ms. They are typically highly distributed and replicated for better availability. +- **Network-attached storage**, which is the practice of using other networked devices to store data on them. There are two distinctive types. The first one is the Network File System (NFS), which is a protocol for mounting the file system of another computer over the network. The other is API-based distributed storage systems, most famously [Amazon S3](https://aws.amazon.com/s3/), that are backed by a fleet of storage-optimized machines of a public cloud, typically using cheap HDDs or some [more exotic](https://aws.amazon.com/storagegateway/vtl/) storage types internally. While NFS can sometimes work even faster than HDD if it is located in the same data center, object storage in the public cloud usually has latencies of 50-100ms. They are typically highly distributed and replicated for better availability. Since SDD/HDD are noticeably slower than RAM, everything on or below this level is usually called *external memory*. -Unlike the CPU caches, external memory can be explicitly controlled. This is useful in many cases, but most programmers just want to abstract away from it and use it as an extension of the main memory, and operating systems have the capability to do so by the virtue of *memory paging*. +Unlike the CPU caches, external memory can be explicitly controlled. This is useful in many cases, but most programmers just want to abstract away from it and use it as an extension of the main memory, and operating systems have the capability to do so by the means of [virtual memory](../virtual). 
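+
+To make "explicitly controlled" a bit more concrete, here is a minimal sketch of processing a file that does not fit in memory by streaming it through a fixed-size buffer, one 1 MB block at a time (the file layout and the block size here are made up for the example):
+
+```c++
+#include <cstdio>
+#include <cstdint>
+#include <vector>
+
+// sum up a binary file of 32-bit integers without ever holding it all in memory
+int64_t sum_file(const char *path) {
+    const size_t block = 1 << 20; // read in 1 MB blocks
+    std::vector<int32_t> buffer(block / sizeof(int32_t));
+    std::FILE *file = std::fopen(path, "rb"); // (error handling omitted for brevity)
+    int64_t sum = 0;
+    size_t n; // how many integers the last read actually fetched
+    while ((n = std::fread(buffer.data(), sizeof(int32_t), buffer.size(), file)) > 0)
+        for (size_t i = 0; i < n; i++)
+            sum += buffer[i];
+    std::fclose(file);
+    return sum;
+}
+```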
diff --git a/content/english/hpc/external-memory/img/memory-vs-compute.png b/content/english/hpc/external-memory/img/memory-vs-compute.png new file mode 100644 index 00000000..6f556440 Binary files /dev/null and b/content/english/hpc/external-memory/img/memory-vs-compute.png differ diff --git a/content/english/hpc/external-memory/list-ranking.md b/content/english/hpc/external-memory/list-ranking.md index 033a65e6..6d7c0053 100644 --- a/content/english/hpc/external-memory/list-ranking.md +++ b/content/english/hpc/external-memory/list-ranking.md @@ -3,29 +3,26 @@ title: List Ranking weight: 5 --- +In this section, we will apply [external sorting](../sorting) and [joining](../sorting#joining) to solve a problem that seems useless on the surface but is actually a key primitive used in a large number of external memory and parallel algorithms. -## List Ranking +**Problem.** Given a singly-linked list, compute the *rank* of each element, equal to its distance from the *last* element. -Now we are going to use external sorting and joining to solve a problem that seems useless, but is actually a very important primitive many graph algorithms in external memory as well as in parallel computing, so bear with me. +![Example input and output for the list ranking problem](../img/list-ranking.png) -**Problem.** Given a linked list, compute *rank* of each element, equal to its distance from the front element. - -![](../img/list-ranking.png) - -The problem is easily solvable in RAM model, but it is nontrivial how to solve this in external memory. Since our data is stored so chaotically, we can't simply traverse the list by querying each new element. +This problem can be trivially solved in the RAM model: you just traverse the entire list with a counter. But this pointer jumping wouldn't work well in the external memory setting because the list nodes are stored arbitrarily, and in the worst case, reading each new node may require reading a new block. ### Algorithm -Consider a slightly more general version of the problem. Now, each element has a *weight* $w_i$, and for each element we need to compute the sum of weights of all preceding elements instead of just its rank. To solve the initial problem, we can just set all weights equal to 1. +Consider a slightly more general version of the problem. Now, each element has a *weight* $w_i$, and for each element, we need to compute the sum of the weights of all its preceding elements instead of just its rank. To solve the initial problem, we can just set all weights equal to 1. -Now, the key idea of the algorithm is to remove some fraction of elements, recursively solve the problem, and then use it to reconstruct the answer for the initial problem. +The main idea of the algorithm is to remove some fraction of elements, recursively solve the problem, and then use these weight-ranks to reconstruct the answer for the initial problem — which is the tricky part. -Consider some three consecutive elements: $x$, $y$ and $z$. Assume that we deleted $y$ and solved the problem for the remaining list, which included $x$ and $z$, and now we need to restore the answer for the original triplet. The weight of $x$ would be correct as it is, but we need to calculate the answer for $y$ and adjust it for $z$, namely: +Consider some three consecutive elements $x$, $y$ and $z$. Assume that we deleted $y$ and solved the problem for the remaining list, which included $x$ and $z$, and now we need to restore the answer for the original triplet. 
The weight of $x$ would be correct as it is, but we need to calculate the answer for $y$ and adjust it for $z$, namely: - $w_y' = w_y + w_x$ - $w_z' = w_z + w_y + w_x$ -Now, we can just delete, say, first element, solve the problem recursively, and recalculate weights for the original array. But, unfortunately, it would work in quadratic time, because to make the update, we would need to know where its neighbors are, and since we can't hold the entire array in memory, we would need to scan it each time. +Now, we can just delete, say, the first element, solve the problem recursively, and recalculate weights for the original array. But, unfortunately, it would work in quadratic time, because to make the update, we would need to know where its neighbors are, and since we can't hold the entire array in memory, we would need to scan it each time. Therefore, on each step, we want to remove as many elements as possible. But we also have a constraint: we can't remove two consecutive elements because then merging results wouldn't be that simple. @@ -35,23 +32,29 @@ $$ T(N) = T\left(\frac{3}{4} N\right) = O(N) $$ -The only tricky part here is how to implement the merge step in external memory. +The only tricky part here is how to implement the merge step in external memory. To do it efficiently, we need to maintain our list in the following form: -To do it efficiently, we need to maintain our list in the following form: - List of tuples $(i, j)$ indicating that element $j$ follows after element $i$ - List of tuples $(i, w_i)$ indicating that element $i$ currently has weight $w_i$ - A list of deleted elements Now, to restore the answer after randomly deleting some elements and recursively solving the smaller problem, we need to iterate over all lists using three pointers looking for deleted elements. and for each such element, we will write $(j, w_i)$ to a separate table, which would signify that before the recursive step we need to add $w_i$ to $j$. We can then join this new table with initial weights, add these additional weights to them. -After coming back from recursion, we need to update weights for the deleted elements, which we can do with the same technique, iterating over reversed connections instead of direct ones. +After coming back from the recursion, we need to update weights for the deleted elements, which we can do with the same technique, iterating over reversed connections instead of direct ones. -I/O complexity of this algorithm with therefore be the same as joining, namely $SORT(N)$. +I/O complexity of this algorithm with therefore be the same as joining, namely $SORT(N) = O\left(\frac{N}{B} \log_{\frac{M}{B}} \frac{N}{M} \right)$. ### Applications List ranking is especially useful in graph algorithms. -For example, we can obtain the euler tour of a tree in external memory by constructing a linked list where, for each edge, we add two copies of it, one for each direction. Then we can apply the list ranking algorithm and get the position of each node which will be the same as its number (*tin*) in the euler tour. +For example, we can obtain the Euler tour of a tree in external memory by constructing a linked list from the tree that corresponds to its Euler tour and then applying the list ranking algorithm — the ranks of each node will be the same as its index $tin_v$ in the Euler tour. 
To construct this list, we need to: + +- split each undirected edge into two directed ones; +- duplicate the parent node for each up-edge (because list nodes can only have one incoming edge, but we visit some vertices multiple times); +- route each such node either to the "next sibling," if it has one, or otherwise to its own parent; +- and then finally break the resulting cycle at the root. + +This general technique is called *tree contraction*, and it serves as the basis for a large number of tree algorithms. -Exactly same approach cay be applied to parallel algorithms, but we will cover that more deeply later. +The same approach can be applied to parallel algorithms, and we will cover that much more deeply in part II. diff --git a/content/english/hpc/external-memory/locality.md b/content/english/hpc/external-memory/locality.md index d7ea4af9..e61cb5a3 100644 --- a/content/english/hpc/external-memory/locality.md +++ b/content/english/hpc/external-memory/locality.md @@ -1,82 +1,97 @@ --- -title: Spacial and Temporal Locality +title: Spatial and Temporal Locality weight: 8 --- + -## Data Locality +To precisely assess the performance of an algorithm in terms of its memory operations, we need to take into account multiple characteristics of the cache system: the number of cache layers, the [memory and block sizes](../hierarchy) of each layer, the exact [strategy](../policies) used for cache eviction by each layer, and sometimes even the details of the [memory paging](../virtual) mechanism. -Abstracting away from the minor details of the cache system helps a lot when designing algorithms. Instead of calculating theoretical cache hit rates, it often makes more sense to reason about cache performance in more abstract qualitative terms. +Abstracting away from all these minor details helps a lot in the first stages of designing algorithms. Instead of calculating theoretical cache hit rates, it often makes more sense to reason about cache performance in more qualitative terms. -We can talk about the degree of cache reuse primarily in two ways: + -We will now go through some examples to show how these concepts can help in optimization. +In this context, we can talk about the degree of cache reuse primarily in two ways: -### Depth-First and Breadth-First +- *Temporal locality* refers to the repeated access of the same data within a relatively small time period, such that the data likely remains cached between the requests. +- *Spatial locality* refers to the use of elements relatively close to each other in terms of their memory locations, such that they are likely fetched in the same memory block. -Consider a divide-and-conquer algorithm such as merge sort. There are two approaches to implementing it: +In other words, temporal locality is when it is likely that this same memory location will soon be requested again, while spatial locality is when it is likely that a nearby location will be requested right after. -- We can implement it recursively, or "depth-first", the way it is normally implemented: sort the left half, sort the right half, and then merge the results. -- We can implement it iteratively, or "breadth-first": do the lowest "layer" first, looping through the entire dataset and comparing odd elements with even elements, then merge the first two elements with the second two elements, the third two elements with the fourth two elements and so on. +In this section, we will do some case studies to show how these high-level concepts can help in practical optimization. + +### Depth-First vs. 
Breadth-First + +Consider a divide-and-conquer algorithm such as merge sorting. There are two approaches to implementing it: + +- We can implement it recursively, or "depth-first," the way it is normally implemented: sort the left half, sort the right half and then merge the results. +- We can implement it iteratively, or "breadth-first:" do the lowest "layer" first, looping through the entire dataset and comparing odd elements with even elements, then merge the first two elements with the second two elements, the third two elements with the fourth two elements and so on. It seems like the second approach is more cumbersome, but faster — because recursion is always slow, right? -But this is not the case for this and many similar divide-and-conquer algorithms. Although the iterative approach has the advantage of only doing sequential I/O, the recursive approach has much better temporal locality: when a segment fully fits into cache, it stays there for all lower layers of recursion, resulting in better access times later on. +Generally, recursion is [indeed slow](/hpc/architecture/functions), but this is not the case for this and many similar divide-and-conquer algorithms. Although the iterative approach has the advantage of only doing sequential I/O, the recursive approach has much better temporal locality: when a segment fully fits into the cache, it stays there for all lower layers of recursion, resulting in better access times later on. -In fact, since we only need $O(\log \frac{N}{M})$ layers until this happens, we would only need to read $O(\frac{N}{B} \log \frac{N}{M})$ blocks in total, while in the iterative approach the entire array will be read from scratch $O(\log N)$ times no matter what. The results in the speedup of $O(\frac{\log N}{\log N - \log M})$, which may be up to an order of magnitude. +In fact, since we only need to split the array $O(\log \frac{N}{M})$ times until this happens, we would only need to read $O(\frac{N}{B} \log \frac{N}{M})$ blocks in total, while in the iterative approach the entire array will be read from scratch $O(\log N)$ times no matter what. This results in the speedup of $O(\frac{\log N}{\log N - \log M})$, which may be up to an order of magnitude. -In practice, there is still some overhead associated with the recursion, and for of this reason, it makes sense to use hybrid algorithms where we don't go all the way down to the base case and instead switch to the iterative code on lower levels of recursion. +In practice, there is still some overhead associated with the recursion, and for this reason, it makes sense to use hybrid algorithms where we don't go all the way down to the base case and instead switch to the iterative code on the lower levels of recursion. ### Dynamic Programming -A similar reasoning can be applied to the implementations of dynamic programming algorithms. - -Consider the classic knapsack problem, where we got $n$ items with integer costs $c_i$, and we need to pick a subset of items with maximum total cost that does not exceed a given constant $w$. +Similar reasoning can be applied to the implementations of dynamic programming algorithms but leading to the reverse result. Consider the classic *knapsack problem:* given $N$ items with positive integer costs $c_i$, pick a subset of items with the maximum total cost that does not exceed a given constant $W$. 
-The way to solve it is to introduce the state of dynamic $f[i, k]$, which corresponds to the maximum total cost less than $k$ can be achieved having already considered and excluded the first $i$ items. It can be updated in $O(1)$ time per entry, by either taking or not taking the $i$-th item and using further states of the dynamic to compute the optimal decision for each state.
+The way to solve it is to introduce the *state* $f[n, w]$, which corresponds to the maximum total cost not exceeding $w$ that can be achieved using only the first $n$ items. These values can be computed in $O(1)$ time per entry if we consider either taking or not taking the $n$-th item and using the previous states of the dynamic to make the optimal decision.
 
-Python has a handy `lru_cache` decorator for implementing it with memoized recursion:
+Python has a handy `lru_cache` decorator which can be used for implementing it with memoized recursion:
 
 ```python
 @lru_cache
-def f(i, k):
-    if i == n or k == 0:
+def f(n, w):
+    # check if we have no items to choose
+    if n == 0:
         return 0
-    if w[i] > k:
-        return f(i + 1, k)
-    return max(f(i + 1, k), c[i] + f(i + 1, k - w[i]))
+
+    # check if we can't pick the last item (note zero-based indexing)
+    if c[n - 1] > w:
+        return f(n - 1, w)
+
+    # otherwise, we can either pick the last item or not
+    return max(f(n - 1, w), c[n - 1] + f(n - 1, w - c[n - 1]))
 ```
 
-When computing $f[n, w]$, the recursion may possibly visit $O(n \cdot w)$ different states, which is asymptotically efficient, but rather slow in reality. Even after nullifying the overhead of Python recursion and the hash table queries required for the LRU cache to work, it would still be slow because it does random I/O throughout most of the execution.
+When computing $f[N, W]$, the recursion may visit up to $O(N \cdot W)$ different states, which is asymptotically efficient, but rather slow in reality. Even after nullifying the overhead of Python recursion and all the [hash table queries](../policies/#implementing-caching) required for the LRU cache to work, it would still be slow because it does random I/O throughout most of the execution.
 
-What we can do instead is to create a 2d array for the dynamic and replace memoized recursion with a nice nested loop like this:
+What we can do instead is to create a two-dimensional array for the dynamic and replace the recursion with a nice nested loop like this:
 
 ```cpp
-int f[N + 1][W + 1];
+int f[N + 1][W + 1] = {0}; // this zero-fills the array
 
-for (int i = n - 1; i >= 0; i++)
-    for (int k = 0; k <= W; k++)
-        f[i][k] = w[i] > k ? f[i + 1][k] : max(f[i + 1][k], c[i] + f[i + 1][k - w[i]]);
+for (int n = 1; n <= N; n++)
+    for (int w = 0; w <= W; w++)
+        f[n][w] = c[n - 1] > w ?
+                  f[n - 1][w] :
+                  max(f[n - 1][w], c[n - 1] + f[n - 1][w - c[n - 1]]);
 ```
 
-Notice that we are only using the previous layer of the dynamic to calculate the next one. This means that if we can store one layer in cache, we would only need to write $O(\frac{n \cdot w}{B})$ blocks in external memory.
+Notice that we are only using the previous layer of the dynamic to calculate the next one. This means that if we can store one layer in the cache, we would only need to write $O(\frac{N \cdot W}{B})$ blocks in external memory.
 
-Moreover, if we only need the answer, we don't actually have to store the whole 2d array, but only the last layer. This lets us use just $O(w)$ memory by maintaining a single array of $w$ values.
To simplify the code, we can slightly change the dynamic to store a binary value: whether it is possible to get the sum of exactly $k$ using the items that we have consider. This dynamic is even faster to compute: +Moreover, if we only need the answer, we don't actually have to store the whole 2d array but only the last layer. This lets us use just $O(W)$ memory by maintaining a single array of $W$ values. To simplify the code, we can slightly change the dynamic to store a binary value: whether it is possible to get the sum of exactly $w$ using the items that we have already considered. This dynamic is even faster to compute: ```cpp -bool f[W + 1] = {}; // this zero-fills the array +bool f[W + 1] = {0}; f[0] = 1; -for (int i = 0; i < n; i++) - for (int x = W - a[i]; x >= 0; x--) - f[x + a[i]] |= f[x]; +for (int n = 0; n < N; n++) + for (int x = W - c[n]; x >= 0; x--) + f[x + c[n]] |= f[x]; ``` As a side note, now that it only uses simple bitwise operations, it can be optimized further by using a bitset: @@ -84,15 +99,15 @@ As a side note, now that it only uses simple bitwise operations, it can be optim ```cpp std::bitset b; b[0] = 1; -for (int i = 0; i < n; i++) - b |= b << c[i]; +for (int n = 0; n < N; n++) + b |= b << c[n]; ``` -Surprisingly, there is still some room for improvement, and we will come back ot this problem later. +Surprisingly, there is still some room for improvement, and we will come back to this problem later. ### Sparse Table -*Sparse table* is a *static* data structure often used for solving static RMQ problem and computing any similar *idempotent reductions* in general. It can be formally defined as a 2d array of size $\log n \times n$: +*Sparse table* is a *static* data structure that is often used for solving the *static RMQ* problem and computing any similar *idempotent range reductions* in general. It can be formally defined as a two-dimensional array of size $\log n \times n$: $$ t[k][i] = \min \{ a_i, a_{i+1}, \ldots, a_{i+2^k-1} \} @@ -100,7 +115,7 @@ $$ In plain English: we store the minimum on each segment whose length is a power of two. -Such array can be used for calculating minima on arbitrary segments in constant time, because for each segment there are two possibly overlapping segments whose sizes is the same power of two, the union of which gives the whole segment. +Such array can be used for calculating minima on arbitrary segments in constant time because for each segment we can always find two possibly overlapping segments whose sizes are the same power of two, the union of which gives the whole segment. ![](../img/sparse-table.png) @@ -113,15 +128,15 @@ int rmq(int l, int r) { // half-interval [l; r) } ``` -The `__lg` function is an intrinsic available in GCC that calculates binary logarithm of a number rounded down. Internally it uses already familiar `clz` ("count leading zeros") instruction and subtracts this count from 32 in case of a 32-bit integer, and thus takes just a few cycles. +The `__lg` function is an intrinsic available in GCC that calculates the binary logarithm of a number rounded down. Internally it uses the `clz` ("count leading zeros") instruction and subtracts this count from 32 (in case of a 32-bit integer), and thus takes just a few cycles. -The reason why I bring it up in this article is because there are multiple alternative ways it can be built, with different performance in terms of I/O operations. 
In general, sparse table can be built in $O(n \log n)$ time in dynamic programming fashion by iterating in the order of increasing $i$ or $k$ and applying the following identity: +The reason why I bring it up in this article is that there are multiple alternative ways it can be built, with different efficiencies in terms of memory operations. In general, a sparse table can be built in $O(n \log n)$ time in dynamic programming fashion by iterating in the order of increasing $i$ or $k$ and applying the following identity: $$ t[k][i] = \min(t[k-1][i], t[k-1][i+2^{k-1}]) $$ -Now, there are two design choices to make: whether the log-size $k$ should be the first or the second dimension, and whether to iterate over $k$ and then $i$ or the other way around. This means that there are of $2×2=4$ ways to build it, and here is the optimal one: +Now, there are two design choices to make: whether the log-size $k$ should be the first or the second dimension, and whether to iterate over $k$ and then $i$ or the other way around. This means that there are $2×2=4$ ways to build it, and here is the optimal one: ```cpp int mn[logn][maxn]; @@ -135,17 +150,17 @@ for (int l = 0; l < logn - 1; l++) This is the only combination of the memory layout and the iteration order that results in beautiful linear passes that work ~3x faster. As an exercise, consider the other three variants and think about *why* they are slower. -### Array-of-Structs and Struct-of-Arrays +### Array-of-Structs vs. Struct-of-Arrays -Suppose you want to implement a binary tree and store its fields in separate arrays like this: +Suppose that you want to implement a binary tree and store its fields in separate arrays like this: ```cpp int left_child[maxn], right_child[maxn], key[maxn], size[maxn]; ``` -Such memory layout, when we store each field separately from others, is called *struct-of-arrays* (SoA). In most cases, when implementing tree operations, you access a node and shortly after request all or most of its data. If these fields are stored separately, this would mean that they are also located in different memory blocks. It some of the requested fields are cached while others are not, you would still have to wait for the data in the lowest layer of cache to arrive. +Such memory layout, when we store each field separately from others, is called *struct-of-arrays* (SoA). In most cases, when implementing tree operations, you access a node and then shortly after all or most of its internal data. If these fields are stored separately, this would mean that they are also located in different memory blocks. If some of the requested fields happen to be are cached while the others are not, you would still have to wait for the slowest of them to be fetched. -In contrast, if it was instead stored as an array-of-structs (AoS), you would need ~4 times less block reads as all the data of the node is stored in the same block and fetched at once: +In contrast, if it was instead stored as an array-of-structs (AoS), you would need ~4 times fewer block reads as all the data of a node is stored in the same block and fetched at once: ```cpp struct Node { @@ -155,11 +170,11 @@ struct Node { Node t[maxn]; ``` -So the AoS layout is beneficial for data structures, but SoA still has good uses: while it is worse for searching, it is much better for linear scanning. +The AoS layout is usually preferred for data structures, but SoA still has good uses: while it is worse for searching, it is much better for linear scanning. 
-This difference in design is important in data processing applications. For example, databases can be either row-based or columnar: +This difference in design is important in data processing applications. For example, databases can be either *row-* or *column-oriented* (also called *columnar*): -- *Row-based* storage formats are used when you need to search for a limited amount of objects in a large dataset, and fetch all or most of their fields. -- *Columnar* storage formats are used for big data processing and analytics, where you need to scan through everything anyway to calculate certain statistics. +- *Row-oriented* storage formats are used when you need to search for a limited number of objects in a large dataset and/or fetch all or most of their fields. Examples: PostgreSQL, MongoDB. +- *Columnar* storage formats are used for big data processing and analytics, where you need to scan through everything anyway to calculate certain statistics. Examples: ClickHouse, Hbase. -Columnar formats have an additional advantage that you can only read the fields that you need, as different fields are stored in separate external memory regions. +Columnar formats have the additional advantage that you can only read the fields that you need, as different fields are stored in separate external memory regions. diff --git a/content/english/hpc/cpu-cache/management.md b/content/english/hpc/external-memory/management.md similarity index 100% rename from content/english/hpc/cpu-cache/management.md rename to content/english/hpc/external-memory/management.md diff --git a/content/english/hpc/external-memory/model.md b/content/english/hpc/external-memory/model.md index 0b0b33f2..9ab86eba 100644 --- a/content/english/hpc/external-memory/model.md +++ b/content/english/hpc/external-memory/model.md @@ -1,28 +1,30 @@ --- -title: Cache-Aware Model +title: External Memory Model weight: 3 --- -To reason about performance of memory-bound algorithms, we need to develop a cost model that is more sensitive to expensive block IO operations, but is not too rigorous to still be useful. +To reason about the performance of memory-bound algorithms, we need to develop a cost model that is more sensitive to expensive block I/O operations but is not too rigorous to still be useful. -In the standard RAM model, we ignore the fact that primitive operations take unequal time to complete. Most importantly, it does not differentiate between operations on different types of memory, equating a read from RAM taking ~50ns in real-time with a read from HDD taking ~5ms, or about a $10^5$ times as much. +### Cache-Aware Model -Similar in spirit, in *external memory model*, we simply ignore every operation that is not an I/O operation. More specifically, we consider one level of cache hierarchy and assume the following about the hardware and the problem: +In the [standard RAM model](/hpc/complexity), we ignore the fact that primitive operations take unequal time to complete. Most importantly, it does not differentiate between operations on different types of memory, equating a read from RAM taking ~50ns in real-time with a read from HDD taking ~5ms, or about a $10^5$ times as much. + +Similar in spirit, in the *external memory model*, we simply ignore every operation that is not an I/O operation. 
More specifically, we consider one level of cache hierarchy and assume the following about the hardware and the problem: - The size of the dataset is $N$, and it is all stored in *external* memory, which we can read and write in blocks of $B$ elements in a unit time (reading a whole block and just one element takes the same time). - We can store $M$ elements in *internal* memory, meaning that we can store up to $\left \lfloor \frac{M}{B} \right \rfloor$ blocks. -- We only care about I/O operations: any computations done in-between reads and writes are free. +- We only care about I/O operations: any computations done in-between the reads and the writes are free. - We additionally assume $N \gg M \gg B$. -In this model, we measure performance of the algorithm in terms of its high-level *I/O operations*, or *IOPS* — that is, the total number of blocks read or written to external memory during execution. +In this model, we measure the performance of an algorithm in terms of its high-level *I/O operations*, or *IOPS* — that is, the total number of blocks read or written to external memory during execution. -We will mostly focus on the case where the internal memory is RAM and external memory is SSD or HDD, although the underlying analysis techniques that we will develop are applicable to any layer in the cache hierarchy. Under these settings, reasonable block size $B$ is about 1MB, internal memory size $M$ is usually a few gigabytes, and $N$ is up to a few terabytes. +We will mostly focus on the case where the internal memory is RAM and the external memory is SSD or HDD, although the underlying analysis techniques that we will develop are applicable to any layer in the cache hierarchy. Under these settings, reasonable block size $B$ is about 1MB, internal memory size $M$ is usually a few gigabytes, and $N$ is up to a few terabytes. -## Array Scan +### Array Scan -As a simple example, when we calculate the sum of array by iterating through it one element at a time, we implicitly load it by chunks of $O(B)$ elements and, in terms of external memory model, process these chunks one by one: +As a simple example, when we calculate the sum of an array by iterating through it one element at a time, we implicitly load it by chunks of $O(B)$ elements and, in terms of the external memory model, process these chunks one by one: $$ \underbrace{a_1, a_2, a_3,} _ {B_1} @@ -31,14 +33,37 @@ $$ \underbrace{a_{n-3}, a_{n-2}, a_{n-1}} _ {B_{m-1}} $$ -Thus, in external memory model, the complexity of summation and other linear array scans is +Thus, in the external memory model, the complexity of summation and other linear array scans is $$ SCAN(N) \stackrel{\text{def}}{=} O\left(\left \lceil \frac{N}{B} \right \rceil \right) \; \text{IOPS} $$ -Note that, in most cases, operating systems do this automatically. Even when the data is just redirected to the standard input from a normal file, the operating system buffers its stream and reads it in blocks of ~4KB (by default). 
+You can implement external array scan explicitly like this: + +```c++ +FILE *input = fopen("input.bin", "rb"); + +const int M = 1024; +int buffer[M], sum = 0; + +// while the file is not fully processed +while (true) { + // read up to M of 4-byte elements from the input stream + int n = fread(buffer, 4, M, input); + // ^ the number of elements that were actually read + + // if we can't read any more elements, finish + if (n == 0) + break; + + // sum elements in-memory + for (int i = 0; i < n; i++) + sum += buffer[i]; +} - +fclose(input); +printf("%d\n", sum); +``` -Now, let's slowly build up more complex things. The goal of this article is to eventually get to *external sorting* and its interesting applications. It will be based on the standard merge sort, so we need to derive a few of its primitives first. +Note that, in most cases, operating systems do this buffering automatically. Even when the data is just redirected to the standard input from a normal file, the operating system buffers its stream and reads it in blocks of ~4KB (by default). diff --git a/content/english/hpc/external-memory/oblivious.md b/content/english/hpc/external-memory/oblivious.md index eec77cca..93c4f2fc 100644 --- a/content/english/hpc/external-memory/oblivious.md +++ b/content/english/hpc/external-memory/oblivious.md @@ -3,18 +3,18 @@ title: Cache-Oblivious Algorithms weight: 7 --- -In the context of cache hierarchies, there are two types of efficient [external memory](../external) algorithms: +In the context of the [external memory model](../model), there are two types of efficient algorithms: - *Cache-aware* algorithms that are efficient for *known* $B$ and $M$. - *Cache-oblivious* algorithms that are efficient for *any* $B$ and $M$. -For example, external merge sort is cache-aware, but not cache-oblivious: we need to know memory characteristics of the system, namely the ratio of available memory to the block size, to find the right $k$ to do k-way merge sort. +For example, [external merge sorting](../sorting) is a cache-aware, but not cache-oblivious algorithm: we need to know the memory characteristics of the system, namely the ratio of available memory to the block size, to find the right $k$ to perform $k$-way merge sort. -Cache-oblivious algorithms are interesting because they automatically become optimal for all memory levels in the cache hierarchy, and not just the one for which they were specifically tuned. In this article we will consider some of their applications in matrix calculations. +Cache-oblivious algorithms are interesting because they automatically become optimal for all memory levels in the cache hierarchy, and not just the one for which they were specifically tuned. In this article, we consider some of their applications in matrix calculations. -## Matrix Transpose +## Matrix Transposition -Assume we have a square matrix $A$ of size $N \times N$ and we need to transpose it. The naive by-definition approach would go something like this: +Assume we have a square matrix $A$ of size $N \times N$, and we need to transpose it. The naive by-definition approach would go something like this: ```cpp for (int i = 0; i < n; i++) @@ -24,11 +24,11 @@ for (int i = 0; i < n; i++) Here we used a single pointer to the beginning of the memory region instead of a 2d array to be more explicit about its memory operations. -The I/O complexity of this code is $O(N^2)$ because the writes are not sequential. 
If you try to swap the iteration variables it is going to be the the other way around, but the result will be the same. +The I/O complexity of this code is $O(N^2)$ because the writes are not sequential. If you try to swap the iteration variables, it will be the other way around, but the result is going to be the same. ### Algorithm -The *cache-oblivious* way relies on the following block matrix identity: +The *cache-oblivious* algorithm relies on the following block matrix identity: $$ \begin{pmatrix} @@ -47,7 +47,7 @@ It lets us solve the problem recursively using a divide-and-conquer approach: 2. Transpose each one recursively. 3. Combine results by swapping the corner result matrices. -Implementing D&C on matrices is a bit more complex than on arrays, but the main idea is the same. Instead of copying submatrices explicitly, we want to use "views" into them, and also switch to the naive method when the data starts fitting in L1 cache (or pick something small like $32 \times 32$ if you don't know it in advance). We also need to carefully handle the case when we have odd $n$ and thus can't split the matrix into 4 equal submatrices. +Implementing D&C on matrices is a bit more complex than on arrays, but the main idea is the same. Instead of copying submatrices explicitly, we want to use "views" into them, and also switch to the naive method when the data starts fitting in the L1 cache (or pick something small like $32 \times 32$ if you don't know it in advance). We also need to carefully handle the case when we have odd $n$ and thus can't split the matrix into 4 equal submatrices. ```cpp void transpose(int *a, int n, int N) { @@ -96,9 +96,9 @@ for (int i = 0; i < n; i++) c[i * n + j] += a[i * n + k] * b[k * n + j]; ``` -It needs to access $O(N^3)$ blocks in total as each scalar multiplication needs a new block read. +It needs to access $O(N^3)$ blocks in total as each scalar multiplication needs a separate block read. -Many people know that one good optimization is to transpose transpose $B$ first: +One well-known optimization is to transpose $B$ first: ```cpp for (int i = 0; i < n; i++) @@ -112,13 +112,13 @@ for (int i = 0; i < n; i++) c[i * n + j] += a[i * n + k] * b[j * n + k]; // <- note the indices ``` -Regardless of whether the transpose is done naively or with the cache-oblivious method we just developed, the matrix multiplication with one of the matrices transposed would work in $O(N^3/B + N^2)$ as all memory accesses are now sequential. +Whether the transpose is done naively or with the cache-oblivious method we previously developed, the matrix multiplication with one of the matrices transposed would work in $O(N^3/B + N^2)$ as all memory accesses are now sequential. -It seems like we can't do better, but turns out we can. +It seems like we can't do better, but it turns out we can. ### Algorithm -Cache-oblivious matrix multiplication involves essentially the same trick. We need to divide the data until it fits into lowest cache (i. e. $N^2 \leq M$). For matrix multiplication, this equates to using this formula: +Cache-oblivious matrix multiplication relies on essentially the same trick as the transposition. We need to divide the data until it fits into lowest cache (i.e., $N^2 \leq M$). 
For matrix multiplication, this equates to using this formula: $$ \begin{pmatrix} @@ -133,7 +133,7 @@ A_{21} B_{11} + A_{22} B_{21} & A_{21} B_{12} + A_{22} B_{22}\\ \end{pmatrix} $$ -It is slightly harder to implement though, as we now have 8 recursive matrix multiplications: +It is slightly harder to implement though because we now have a total of 8 recursive matrix multiplications: ```cpp void matmul(const float *a, const float *b, float *c, int n, int N) { @@ -198,11 +198,11 @@ $$ T(N) = O\left(\frac{(\sqrt{M})^2}{B} \cdot \left(\frac{N}{\sqrt M}\right)^3\right) = O\left(\frac{N^3}{B\sqrt{M}}\right) $$ -This is better than just $O(\frac{N^3}{B})$ by quite a lot. +This is better than just $O(\frac{N^3}{B})$, and by quite a lot. ### Strassen Algorithm -In a spirit similar to the Karatsuba algorithm, matrix multiplication can be decomposed in a way that involves 7 matrix multiplications of size $\frac{n}{2}$, and master theorem tells us the such divide-and-conquer algorithm would work in $O(n^{\log_2 7}) \approx O(n^{2.81})$ time and a similar asymptotic in external memory model. +In a spirit similar to the Karatsuba algorithm, matrix multiplication can be decomposed in a way that involves 7 matrix multiplications of size $\frac{n}{2}$, and the master theorem tells us that such divide-and-conquer algorithm would work in $O(n^{\log_2 7}) \approx O(n^{2.81})$ time and a similar asymptotic in the external memory model. This technique, known as the Strassen algorithm, similarly splits each matrix into 4: @@ -221,7 +221,7 @@ B_{21} & B_{22} \\ \end{pmatrix} $$ -It then computes intermediate products of the $\frac{N}{2} \times \frac{N}{2}$ matrices and combines them to get matrix $C$: +But then it computes intermediate products of the $\frac{N}{2} \times \frac{N}{2}$ matrices and combines them to get matrix $C$: $$ \begin{aligned} @@ -237,10 +237,10 @@ $$ You can verify these formulas with simple substitution if you feel like it. -As far as we know, none of the mainstream optimized linear algebra libraries use the Strassen algorithm, although there are some prototype implementations that become efficient for matrices larger than 4000 or so. +As far as I know, none of the mainstream optimized linear algebra libraries use the Strassen algorithm, although there are [some prototype implementations](https://arxiv.org/pdf/1605.01078.pdf) that are efficient for matrices larger than 2000 or so. This technique can and actually has been extended multiple times to reduce the asymptotic even further by considering more submatrix products. As of 2020, current world record is $O(n^{2.3728596})$. Whether you can multiply matrices in $O(n^2)$ or at least $O(n^2 \log^k n)$ time is an open problem. ## Further Reading -[Cache-Oblivious Algorithms and Data Structures](https://erikdemaine.org/papers/BRICS2002/paper.pdf) by Erik Demaine. +For a solid theoretical viewpoint, consider reading [Cache-Oblivious Algorithms and Data Structures](https://erikdemaine.org/papers/BRICS2002/paper.pdf) by Erik Demaine. 
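As a quick sanity check of the Strassen identities above — assuming the standard definitions $M_3 = A_{11} (B_{12} - B_{22})$ and $M_5 = (A_{11} + A_{12}) B_{22}$, which is the conventional labeling these formulas use — the top-right block can be verified by direct substitution:

$$
C_{12} = M_3 + M_5 = A_{11} B_{12} - A_{11} B_{22} + A_{11} B_{22} + A_{12} B_{22} = A_{11} B_{12} + A_{12} B_{22}
$$

which is exactly the top-right block of the product of the two block matrices; the other three entries work out the same way.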
diff --git a/content/english/hpc/external-memory/policies.md b/content/english/hpc/external-memory/policies.md index ef6da591..4cb36bdd 100644 --- a/content/english/hpc/external-memory/policies.md +++ b/content/english/hpc/external-memory/policies.md @@ -3,56 +3,60 @@ title: Eviction Policies weight: 6 --- -## Caching Strategies +You can control the I/O operations of your program manually, but most of the time people just rely on automatic bufferization and caching, either due to laziness or because of the computing environment limitations. -When you run out of inner memory to store your data, you need to delete one block to make space for a new one. Since caching usually happens in the background, you need a concrete rule for deciding which data to retain in the cache, called *eviction policy*. +But automatic caching comes with its own challenges. When a program runs out of working memory to store its intermediate data, it needs to get rid of one block to make space for a new one. A concrete rule for deciding which data to retain in the cache in case of conflicts is called an *eviction policy*. This rule can be arbitrary, but there are several popular choices: - First in first out (FIFO): simply evict the earliest added block, without any regard to how often it was accessed before (the same way as a FIFO queue). - Least recently used (LRU): evict the block that has not been accessed for the longest period of time. - Last in first out (LIFO) and most recently used (MRU): the opposite of the previous two. It seems harmful to delete the hottest blocks, but there are scenarios where these policies are optimal, such as repeatedly looping around a file in a cycle. -- Least-frequently used (LFU): counts how often each block has been requested, and discards the one used least often. There are variations that account for changing access patterns over time, such as using a time window to only consider the last $n$ accesses, or using exponential averaging to give recent accesses more weight. +- Least-frequently used (LFU): counts how often each block has been requested and discards the one used least often. Some variations also account for changing access patterns over time, such as using a time window to only consider the last $n$ accesses or using exponential averaging to give recent accesses more weight. - Random replacement (RR): discard a block randomly. The advantage is that it does not need to maintain any data structures with block information. -There is a natural trade-off between the accuracy of eviction policies and the additional overhead due to the complexity of their implementations. For a CPU cache, you need a simple policy that can be easily implemented in hardware with almost zero latency, while in more slow-paced and plannable settings such as Netflix deciding in which data centers to store their movies or Google Drive optimizing where to store user data, it makes sense to use more complex policies, possibly involving machine learning to predict when the data is going to be accessed next. +There is a natural trade-off between the accuracy of eviction policies and the additional overhead due to the complexity of their implementations. 
For a CPU cache, you need a simple policy that can be easily implemented in hardware with next-to-zero latency, while in more slow-paced and plannable settings such as Netflix deciding in which data centers to store their movies or Google Drive optimizing where to store user data, it makes sense to use more complex policies, possibly involving some machine learning to predict when the data is going to be accessed next. -### Implementing Caching +### Optimal Caching -This is not always a trivial task to find the right block to evict in a reasonable time. While CPU caches are implemented in hardware (usually as a variation of LRU), higher-level eviction policies have to rely on software to store certain statistics about the blocks and maintain data structures on top of them to speed up the process. +Apart from the aforementioned strategies, there is also the theoretical *optimal policy*, denoted as $OPT$ or $MIN$, which determines, for a given sequence of queries, which blocks should be retained to minimize the total number of cache misses. -For example, let's think about what it takes to implement an LRU cache. Assume we are storing some moderately large objects — say, we need to develop a cache for a database, there both the requests and replies are medium-sized strings in some SQL dialect, so the overhead of our structure is small, but non-negligible. +These decisions can be made using a simple greedy approach called *Bélády algorithm*: we can just keep the *latest-to-be-used* block, and it can be shown by contradiction that doing so is always one of the optimal solutions. The downside of this method is that you either need to have these queries in advance or somehow be able to predict the future. - +The good thing is that, in terms of asymptotic complexity, it doesn't really matter which particular method is used. [Sleator & Tarjan showed](https://www.cs.cmu.edu/~sleator/papers/amortized-efficiency.pdf) that in most cases, the performance of popular policies such as $LRU$ differs from $OPT$ just by a constant factor. + +**Theorem.** Let $LRU_M$ and $OPT_M$ denote the number of blocks a computer with $M$ internal memory would need to access while executing the same algorithm following the least recently used cache replacement policy and the theoretical minimum respectively. Then: + +$$ +LRU_M \leq 2 \cdot OPT_{M/2} +$$ -First of all, we need a hash table to find the data itself. Since we are working with large variable-length strings, it makes sense to use a hash of the query as the key and a pointer to the heap-allocated result string as the value. +The main idea of the proof is to consider the worst case scenario. For LRU it would be the repeating series of $\frac{M}{B}$ distinct blocks: each block is new and so LRU has 100% cache misses. Meanwhile, $OPT_{M/2}$ would be able to cache half of them (but not more, because it only has half the memory). Thus $LRU_M$ needs to fetch double the number of blocks that $OPT_{M/2}$ does, which is basically what is expressed in the inequality, and anything better for $LRU$ would only weaken it. -To implement the LRU logic, the simplest approach would be to create a queue where we put the current time and IDs/keys of objects when we access them, and also store for each object when was the last time it was accessed (not necessarily as a timestamp — any increasing counter will suffice). 
+![Dimmed are the blocks cached by OPT (but not cached by LRU)](../img/opt.png) -Now, when we need to free up space, we can find the least recently used object by popping elements from the front of the queue — but we can't just delete them, because it may be that they were accessed again since their record was added to the queue. So we need to check if the timestamp when we put them in queue matches the timestamp when they were last accessed, and only then free up the memory. +This is a very relieving result. It means that, at least in terms of asymptotic I/O complexity, you can just assume that the eviction policy is either LRU or OPT — whichever is easier for you — do complexity analysis with it, and the result you get will normally transfer to any other reasonable cache replacement policy. -The only problem here is that we add an entry to the queue each time a block is accessed, and only remove entries when we have a cache miss and start popping them off from the front until we have a match. This may lead to the queue overflowing, and to counter this, instead of adding an entry and forgetting about it, we can move it to the end of the queue on a cache hit right away. +### Implementing Caching -To support this, we need to implement the queue over a doubly linked list and store a pointer to the block's node in the queue in the hash table. Then, when we have a cache hit, we follow the pointer and remove the node from the linked list in constant time, and add a newer node to the end of the queue. This way, at any point in time, there would be exactly as many nodes in the queue as we have objects, and the memory overhead will be guaranteed to be constant per cache entry. + -As an exercise, try to think about ways to implement other caching strategies. It is quite fun, I assure you. +This is not always a trivial task to find the right block to evict in a reasonable time. While CPU caches are implemented in hardware (usually as some variation of LRU), higher-level eviction policies have to rely on software to store certain statistics about the blocks and maintain data structures on top of them to speed up the process. -### Optimal Caching +Let's think about what we need to implement an LRU cache. Assume we are storing some moderately large objects — say, we need to develop a cache for a database, there both the requests and replies are medium-sized strings in some SQL dialect, so the overhead of our structure is small but non-negligible. -Apart from aforementioned strategies, there is also what's called *Bélády algorithm*, often denoted as $OPT$ or $MIN$, which determined which blocks should be retained in the *optimal* policy for a given sequence of queries. + -The way it achieves it is simple: we can always greedily keep the *latest-to-be-used* block, and it can be shown by contradiction that doing so is always one of the optimal solutions. The downside of this method is that you either need to have these queries in advance or somehow be able to predict the future. +First of all, we need a hash table to find the data itself. Since we are working with large variable-length strings, it makes sense to use the hash of the query as the key and a pointer to a heap-allocated result string as the value. -But the good thing is that, in terms of asymptotic complexity, it doesn't really matter which particular method is used. 
[Sleator & Tarjan showed](https://www.cs.cmu.edu/~sleator/papers/amortized-efficiency.pdf) that in most cases, the performance of popular policies such as $LRU$ differs from $OPT$ just by a constant factor. +To implement the LRU logic, the simplest approach would be to create a queue where we put the current time and IDs/keys of objects when we access them, and also store when each object was accessed the last time (not necessarily as a timestamp — any increasing counter will suffice). -**Theorem.** Let $LRU_M$ and $OPT_M$ denote the number of blocks a computer with $M$ internal memory would need to access while executing the same algorithm following the least recently used cache replacement policy and the theoretical minimum respectively. Then +Now, when we need to free up space, we can find the least recently used object by popping elements from the front of the queue. We can't just delete them, because it may be that they were accessed again since their record was added to the queue. So we need to check if the time of when we put them in queue matches the time of when they were last accessed, and only then free up the memory. -$$ -LRU_M \leq 2 \cdot OPT_{M/2} -$$ +The only remaining issue here is that we add an entry to the queue each time a block is accessed, and only remove entries when we have a cache miss and start popping them off from the front until we have a match. This may lead to the queue overflowing, and to mitigate this, instead of adding an entry and forgetting about it, we can move it to the end of the queue on a cache hit right away. -The main idea of the proof is to consider the "worst case" scenario. For LRU it would be the repeating series of $\frac{M}{B}$ distinct blocks: each block is new and so LRU has 100% cache misses. Meanwhile, $OPT_{M/2}$ would be able to cache half of them (but not more, because it only has half the memory). Thus $LRU_M$ needs to fetch double the number of blocks that $OPT_{M/2}$ does, which is basically what is expressed in the inequality, and anything better for $LRU$ would only weaken it. +To support this, we need to implement the queue over a doubly-linked list and store a pointer to the block's node in the queue in the hash table. Then, when we have a cache hit, we follow the pointer and remove the node from the linked list in constant time, and add a newer node to the end of the queue. This way, at any point in time, there would be exactly as many nodes in the queue as we have objects, and the memory overhead will be guaranteed to be constant per cache entry. -![Dimmed are the blocks cached by OPT (but note cached by LRU)](../img/opt.png) +As an exercise, try to think about ways to implement other caching strategies. -This is a very relieving result. It means that, at least in terms of asymptotic I/O complexity, you can just assume that the eviction policy is either LRU or OPT — whichever is easier for you — do complexity analysis with it, and the result you get will normally transfer to any other reasonable cache replacement policy. + diff --git a/content/english/hpc/external-memory/sorting.md b/content/english/hpc/external-memory/sorting.md index 55caafa6..299da78f 100644 --- a/content/english/hpc/external-memory/sorting.md +++ b/content/english/hpc/external-memory/sorting.md @@ -1,13 +1,18 @@ --- title: External Sorting weight: 4 +published: true --- -## Merge +Now, let's try to design some actually useful algorithms for the new [external memory model](../model). 
Our goal in this section is to slowly build up more complex things and eventually get to *external sorting* and its interesting applications. -**Problem:** given two sorted arrays $a$ and $b$ of lengths $N$ and $M$, produce a single sorted array $c$ of length $N + M$ containing all of their elements. +The algorithm will be based on the standard merge sorting algorithm, so we need to derive its main primitive first. -The standard technique using two pointers looks like this: +### Merge + +**Problem.** Given two sorted arrays $a$ and $b$ of lengths $N$ and $M$, produce a single sorted array $c$ of length $N + M$ containing all of their elements. + +The standard two-pointer technique for merging sorted arrays looks like this: ```cpp void merge(int *a, int *b, int *c, int n, int m) { @@ -25,49 +30,47 @@ In terms of memory operations, we just linearly read all elements of $a$ and $b$ So far the examples have been simple, and their analysis doesn't differ too much from the RAM model, except that we divide the final answer by the block size $B$. But here is a case where this is not so. -**K-way merging.** Consider the modification of this algorithm where we need to merge not just two arrays, but $k$ arrays of total size $N$ — by likewise looking at $k$ values, choosing the minimum between them, writing it into $c$ and incrementing one of the iterators. +**$k$-way merging.** Consider the modification of this algorithm where we need to merge not just two arrays, but $k$ arrays of total size $N$ — by likewise looking at $k$ values, choosing the minimum between them, writing it into $c$, and incrementing one of the iterators. -In the standard RAM model, the asymptotic complexity would be multiplied $k$, since we would need to do $O(k)$ comparisons to fill each next element. But in external memory model, since everything we do in-memory doesn't cost us anything, its asymptotic complexity would not change as long as we can fit $(k+1)$ full blocks in memory, that is, if $k = O(\frac{M}{B})$. +In the standard RAM model, the asymptotic complexity would be multiplied by $k$, since we would need to perform $O(k)$ comparisons to fill each next element. But in the external memory model, since everything we do in-memory doesn't cost us anything, its asymptotic complexity would not change as long as we can fit $(k+1)$ full blocks in memory, that is, if $k = O(\frac{M}{B})$. -Remember the $M \gg B$ assumption? If we have $M \geq B^{1+ε}$ for $\epsilon > 0$, then we can fit any sub-polynomial amount of blocks in memory, certainly including $O(\frac{M}{B})$. This condition is called *tall cache assumption*, and it is usually required in many other external memory algorithms. +Remember [the $M \gg B$ assumption](../model) when we introduced the computational model? If we have $M \geq B^{1+ε}$ for $\epsilon > 0$, then we can fit any sub-polynomial number of blocks in memory, certainly including $O(\frac{M}{B})$. This condition is called the *tall cache assumption*, and it is usually required in many other external memory algorithms. -## Merge Sorting +### Merge Sorting -The "normal" complexity the standard mergesort algorithm is $O(N \log_2 N)$: on each of its $O(\log_2 N)$ "layers", the algorithms need to go through all $N$ elements in total and merge them in linear time. +The "normal" complexity of the standard mergesort algorithm is $O(N \log_2 N)$: on each of its $O(\log_2 N)$ "layers," the algorithm needs to go through all $N$ elements in total and merge them in linear time.
-In external memory model, when we read a block of size $M$, we can sort its elements "for free", since they are already in memory. This way we can split the arrays into $O(\frac{N}{M})$ blocks of consecutive elements and sort them separately as the base step, and only then merge them. +In the external memory model, when we read a block of size $M$, we can sort its elements "for free," since they are already in memory. This way we can split the arrays into $O(\frac{N}{M})$ blocks of consecutive elements and sort them separately as the base step, and only then merge them. ![](../img/k-way.png) -This effectively means that, in terms of IO operations, the first $O(\log M)$ layers of mergesort are free, and there are only $O(\log_2 \frac{N}{B})$ non-zero-cost layers, each mergeable in $O(\frac{N}{B})$ IOPS in total. This brings total I/O complexity to +This effectively means that, in terms of I/O operations, the first $O(\log M)$ layers of mergesort are free, and there are only $O(\log_2 \frac{N}{M})$ non-zero-cost layers, each mergeable in $O(\frac{N}{B})$ IOPS in total. This brings total I/O complexity to $$ -O(\frac{N}{B} \log_2 \frac{N}{M}) +O\left(\frac{N}{B} \log_2 \frac{N}{M}\right) $$ This is quite fast. If we have 1GB of memory and 10GB of data, this essentially means that we need a little bit more than 3 times the effort than just reading the data to sort it. Interestingly enough, we can do better. -### K-way Mergesort +### $k$-way Mergesort Half of a page ago we have learned that in the external memory model, we can merge $k$ arrays just as easily as two arrays — at the cost of reading them. Why don't we apply this fact here? -Let's sort each block of size $M$ in-memory just as we did before, but during each merge stage, we will split sorted blocks not just in pairs to be merged, but take as many blocks we can fit into our memory during a k-way merge. This way the height of the merge tree would be greatly reduced, while each layer would still be done in $O(\frac{N}{B})$ IOPS. +Let's sort each block of size $M$ in-memory just as we did before, but during each merge stage, we will split sorted blocks not just in pairs to be merged, but take as many blocks we can fit into our memory during a $k$-way merge. This way the height of the merge tree would be greatly reduced, while each layer would still be done in $O(\frac{N}{B})$ IOPS. -How many sorted arrays can we merge at once? Exactly $k = \frac{M}{B}$, since we need memory for one block for each array. Since the total amount of layers will be reduced to $\log_{\frac{M}{B}} \frac{N}{M}$, the whole complexity will be reduced to +How many sorted arrays can we merge at once? Exactly $k = \frac{M}{B}$, since we need memory for one block for each array. Since the total number of layers will be reduced to $\log_{\frac{M}{B}} \frac{N}{M}$, the total complexity will be reduced to $$ SORT(N) \stackrel{\text{def}}{=} O\left(\frac{N}{B} \log_{\frac{M}{B}} \frac{N}{M} \right) $$ -Note that, in our example, we have 10GB of data, 1GB of memory, and the block size is around 1MB for HDD. This makes $\frac{M}{B} = 1000$ and $\frac{N}{M} = 10$, and so the logarithm is less than one (namely, $\log_{1000} 10 = \frac{1}{3}$). Of course, we can't sort an array faster than reading it, so this analysis applies to the cases when we have very large dataset, small memory, and/or large block sizes, which happens in real life nowadays. +Note that, in our example, we have 10GB of data, 1GB of memory, and the block size is around 1MB for HDD. 
This makes $\frac{M}{B} = 1000$ and $\frac{N}{M} = 10$, and so the logarithm is less than one (namely, $\log_{1000} 10 = \frac{1}{3}$). Of course, we can't sort an array faster than reading it, so this analysis applies to the cases when we have a very large dataset, small memory, and/or large block sizes, which rarely happens in real life these days. ### Practical Implementation -Under more realistic constraints, instead of using $\log_{\frac{M}{B}} \frac{N}{M}$ layers, we can do just two: one for sorting data in blocks of $M$, and another one for merging all of them at once. With a gigabyte of RAM and a block size of 1MB, this would be enough to sort arrays up to a terabyte in size. +Under more realistic constraints, instead of using $\log_{\frac{M}{B}} \frac{N}{M}$ layers, we can use just two: one for sorting data in blocks of $M$ elements, and another one for merging all of them at once. This way, from the I/O operations perspective, we just loop around our dataset twice. And with a gigabyte of RAM and a block size of 1MB, this way can sort arrays up to a terabyte in size. -This way we would essentially just loop around our dataset twice. THe bandwidth of HDDs can be quite high, and we wouldn't want to stall it, so we need a slightly faster way to merge $k$ arrays than by finding minimum with $O(k)$ comparisons — namely, we can maintain for $k$ elements, and extract minimum elements from it in a manner almost identical to heapsort. - -Here is the first phase looks in C++: +Here is how the first phase looks in C++. This program opens a multi-gigabyte binary file with unsorted integers, reads it in blocks of 256MB, sorts them in memory, and then writes them back in files named `part-000.bin`, `part-001.bin`, `part-002.bin`, and so on: ```cpp const int B = (1<<20) / 4; // 1 MB blocks of integers @@ -83,7 +86,7 @@ while (true) { if (n == 0) break; - // sort in-memory + // sort a block in-memory std::sort(part, part + n); char fpart[sizeof "part-999.bin"]; @@ -102,76 +105,93 @@ while (true) { fclose(input); ``` -This would create many arrays named `part-000.bin`, `part-001.bin`, `part-002.bin` and so on. +What is left now is to merge them together. The bandwidth of modern HDDs can be quite high, and there may be a lot of parts to merge, so the I/O efficiency of this stage is not our only concern: we also need a faster way to merge $k$ arrays than by finding minima with $O(k)$ comparisons. We can do that in $O(\log k)$ time per element if we maintain a min-heap for these $k$ elements, in a manner almost identical to heapsort. -What is left now is to merge them together. First we create the an array for storing pointers to current elements of all block, their separate buffers, and a priority queue, that we populate with their first elements: +Here is how to implement it. 
First, we are going to need a heap (`priority_queue` in C++): -```cpp -std::priority_queue< std::pair<int, int> > q; +```c++ +struct Pointer { + int key, part; // the element itself and the number of its part + + bool operator<(const Pointer& other) const { + return key > other.key; // std::priority_queue is a max-heap by default + } +}; + +std::priority_queue<Pointer> q; +``` + +Then, we need to allocate and fill the buffers: +```c++ const int nparts = parts.size(); -auto buffers = new int[nparts][B]; -int outbuffer[B]; -std::vector<int> l(nparts), r(nparts); +auto buffers = new int[nparts][B]; // buffers for each part +int *l = new int[nparts], // # of already processed buffer elements + *r = new int[nparts]; // buffer size (in case it isn't full) + +// now we fill the buffer for each part and add their elements to the heap for (int part = 0; part < nparts; part++) { + l[part] = 1; // if the element is in the heap, we also consider it "processed" r[part] = fread(buffers[part], 4, B, parts[part]); q.push({buffers[part][0], part}); - l[part] = 1; } ``` -Now we need to populate the result file until it is full, carefully writing it and reading new batches of elements when needed: +Now we just need to pop elements from the heap into the result file until it is empty, carefully writing and reading elements in batches: ```cpp FILE *output = fopen("output.bin", "w"); -int buffered = 0; + +int outbuffer[B]; // the output buffer +int buffered = 0; // number of elements in it while (!q.empty()) { auto [key, part] = q.top(); q.pop(); + // write the minimum to the output buffer outbuffer[buffered++] = key; + // check if it needs to be committed to the file if (buffered == B) { fwrite(outbuffer, 4, B, output); buffered = 0; } + // fetch a new block of that part if needed if (l[part] == r[part]) { r[part] = fread(buffers[part], 4, B, parts[part]); l[part] = 0; } + // read a new element from that part unless we've already processed all of it if (l[part] < r[part]) { q.push({buffers[part][l[part]], part}); l[part]++; } } +// write what's left of the output buffer fwrite(outbuffer, 4, buffered, output); +// clean up delete[] buffers; for (FILE *file : parts) fclose(file); - fclose(output); ``` -This implementation is not particularly effective or safe-looking (well, this is basically C), but is a good educational example of how to work with low-level memory APIs. +This implementation is not particularly effective or safe-looking (well, this is basically plain C), but is a good educational example of how to work with low-level memory APIs. -## Joining +### Joining -Sorting by mainly used not by itself, but as an intermediate step for other operations. One important real-world use case for external sorting is joining (as in "SQL join"), used in databases and other data processing applications. +Sorting is mainly used not by itself, but as an intermediate step for other operations. One important real-world use case of external sorting is joining (as in "SQL join"), used in databases and other data processing applications. **Problem.** Given two lists of tuples $(x_i, a_{x_i})$ and $(y_i, b_{y_i})$, output a list $(k, a_{x_k}, b_{y_k})$ such that $x_k = y_k$ -The optimal solution would be to sort the two lists and then use the standard two-pointer technique to merge them. The I/O complexity here would be the same as sorting, and just $O(\frac{N}{B})$ if the arrays are already sorted. - -This is why most data processing applications (databases, MapReduce systems) like to keep their tables at least partially sorted.
- -### Other Implementations +The optimal solution would be to sort the two lists and then use the standard two-pointer technique to merge them. The I/O complexity here would be the same as sorting, and just $O(\frac{N}{B})$ if the arrays are already sorted. This is why most data processing applications (databases, MapReduce systems) like to keep their tables at least partially sorted. -Note that this analysis is only applicable in external memory setting — that is, if you don't have the memory to fit entire dataset. In the real world, it is important to consider alternative methods. +**Other approaches.** Note that this analysis is only applicable in the external memory setting — that is, if you don't have the memory to read the entire dataset. In the real world, alternative methods may be faster. The simplest of them is probably *hash join*, which goes something like this: @@ -183,6 +203,6 @@ def join(a, b): yield d[x] ``` -In external memory, joining two lists with a hash table would be unfeasible, as it would involve doing $O(M)$ entire block reads. +In external memory, joining two lists with a hash table would be unfeasible, as it would involve doing $O(M)$ block reads, even though only one element is used in each of them. -Another way is to use alternative sorting algorithms such as radix sort. In particular, radix sort would work in $O(\frac{N}{B} \cdot w)$ if enough memory is available to maintain a buffer possible key, which could be beneficial in the case of small keys and large datasets +Another method is to use alternative sorting algorithms such as radix sort. In particular, radix sort would work in $O(\frac{N}{B} \cdot w)$ block reads if enough memory is available to maintain buffers for all possible keys, and it could be faster in the case of small keys and large datasets. diff --git a/content/english/hpc/external-memory/virtual.md b/content/english/hpc/external-memory/virtual.md index dbfa1594..92bb454c 100644 --- a/content/english/hpc/external-memory/virtual.md +++ b/content/english/hpc/external-memory/virtual.md @@ -3,17 +3,50 @@ title: Virtual Memory weight: 2 --- -Modern operating systems give every process the impression that it is working with large, contiguous sections of memory, called *virtual memory*. Physically, the memory allocated to each process may be dispersed across different areas of physical memory, or may have been moved to another storage such as SSD or HDD. +Early operating systems gave every process the freedom of reading and modifying any memory region they want, including those allocated for other processes. While this keeps things simple, it also poses some problems: -Do achieve this, the address space of the virtual memory is divided into *pages* (typically 4KB in size), and the memory system maintains a separate hardware data structure called *page table*, which points to where the data is physically stored for each page. When a process requests access to data in its memory, the operating system maps the virtual address to the physical address through the page table and forwards the read/write request to where that data is actually stored. +- What if one of the processes is buggy or outright malicious? How do we prevent it from modifying the memory allocated for other processes while still keeping inter-process communication through memory possible? +- How do we deal with memory fragmentation? 
Say, we have 4MB of memory, process A allocates the first 1MB for itself, then process B claims the next 2MB, then A terminates and releases its memory, and then process C comes and asks for a contiguous 2MB region — and can't get it because we only have two separate 1MB slices. Restarting process B or somehow stopping it and shifting all its data and pointers by one megabyte doesn't seem like a good solution. +- How do we access non-RAM memory types? How do we plug a flash drive and read a specific file from it? -Since the address translation needs to be done for each memory request, this process is also cached with what's called *translation lookaside buffer* (TLB), which is just a very small cache for physical page addresses. When it doesn't hit, you essentially pay double the cost of a memory access. For this reason, some operating systems have support for larger pages (~2MB). +These problems are not that critical for some specialized computer systems such as GPUs, where you typically solve just one task at a time and have full control over the computation, but they are absolutely essential for modern multitasking operating systems — and they solve all these problems with a technique called *virtual memory*. -![From John Bell\'s OS course at University of Illinois](../img/virtual-memory.jpg) +### Memory Paging -This mechanism allows using external memory quite transparently. Operating systems have two basic mechanisms: +Virtual memory gives each process the impression that it fully controls a contiguous region of memory, which in reality may be mapped to multiple smaller blocks of the physical memory — which includes both the main memory (RAM) and external memory (HDD, SSD). -- *Swap files*, which let the operating system automatically use parts of an SDD or an HDD as an extension of RAM when there is not enough real RAM. -- [Memory mapping](https://en.wikipedia.org/wiki/Mmap), which lets you open a file a use its contents as if they were in the main memory. +![](../img/virtual-memory.jpg) -This essentially turns your RAM into "L4 cache" for the external memory, which is a good way to reason about it. +To achieve this, the memory address space is divided into *pages* (typically 4KB in size), which are the base units of memory that the programs can request from the operating system. The memory system maintains a special hardware data structure called the *page table*, which contains the mappings of virtual page addresses to the physical ones. When a process accesses data using its virtual memory address, the memory system calculates its page number (by right-shifting it by $12$ if $4096=2^{12}$ is the page size), looks up in the page table that its physical address is, and forwards the read or write request to where that data is actually stored. + +Since the address translation needs to be done for each memory request, and the number of memory pages itself may be large (e.g., 16G RAM / 4K page size = 4M pages), address translation poses a difficult problem in itself. One way to speed it up is to use a special cache for the page table itself called *translation lookaside buffer* (TLB), and the other is to [increase the page size](/hpc/cpu-cache/paging) so that the total number of memory pages is made smaller at the cost of reduced granularity. + + + +### Mapping External Memory + +The mechanism of virtual memory also allows using external memory types quite transparently. 
Modern operating systems support [memory mapping](https://en.wikipedia.org/wiki/Mmap), which lets you open a file and use its contents as if they were in the main memory: + +```c++ +// open a file containing 1024 random integers for reading and writing +int fd = open("input.bin", O_RDWR); +// map it into memory (4096 bytes in size), allow reads and writes, and write changes back to the file +int* data = (int*) mmap(0, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); +// sort it as if it were a normal integer array +std::sort(data, data + 1024); +// changes are eventually propagated to the file +``` + +Here we map a 4K file, which can fit entirely on just a single memory page, but when we open larger files, their reads will be done lazily when we request a certain page, and their writes will be buffered and committed to the file system when the operating system decides to (usually on program termination or when the system runs out of RAM). + +A technique that has the same operating principle but the reverse intention is the *swap file*, which lets the operating system automatically use parts of an SSD or an HDD as an extension of the main memory when there is not enough real RAM. This lets systems that run out of memory just experience a terrible slowdown instead of crashing. + +This seamless integration of the main and external memory essentially turns RAM into an "L4 cache" for the external memory, which is a convenient way to think about it from the algorithm design perspective. diff --git a/content/english/hpc/number-theory/_index.md b/content/english/hpc/number-theory/_index.md index 091f476f..e91fa1fb 100644 --- a/content/english/hpc/number-theory/_index.md +++ b/content/english/hpc/number-theory/_index.md @@ -1,13 +1,40 @@ --- title: Number Theory weight: 7 -draft: true --- -In 1940, British mathematician Godfrey Harold Hardy published a famous essay titled [A Mathematician's Apology](https://en.wikipedia.org/wiki/A_Mathematician%27s_Apology) where he discusses the notion that mathematics should be pursued for its own sake rather than for the sake of its applications. As a 62-year-old, he saw the devastation caused by first world war, and was amidst the second one. +In 1940, a British mathematician [G. H. Hardy](https://en.wikipedia.org/wiki/G._H._Hardy) published a famous essay titled "[A Mathematician's Apology](https://en.wikipedia.org/wiki/A_Mathematician%27s_Apology)" discussing the notion that mathematics should be pursued for its own sake rather than for the sake of its applications. -A scientist faces a moral dilemma because some of its inventions may do more harm than good. One can find calm in pursuing useless math. Hardy himself specialized in number theory, and he was content about it not having any applications: "No one has yet discovered any warlike purpose to be served by the theory of numbers or relativity, and it seems unlikely that anyone will do so for many years". +Similar to mathematics, the various fields of computer science also form a spectrum, with mathematical logic and computability theory on one end and web programming and application development on the other. I assume that you, the reader, are more on the applied side: this book was written to show that there are way too few people working on practical algorithm design instead of theoretical computer science — and since you got to Chapter 7, you probably also believe in that statement. + +But, regardless of the personal views on the matter, one can see where Hardy is coming from.
Being 62 years old at the time of writing, he witnessed the devastation caused by the First and the ongoing Second World War — which was greatly amplified by the weaponization of science. As a number theorist, Hardy finds calm working in a "useless" field and not having to face any moral dilemmas, writing: + +> No one has yet discovered any warlike purpose to be served by the theory of numbers or relativity, and it seems unlikely that anyone will do so for many years. + +Ironically, this statement was proved very wrong just 5 years later with the development of the atomic bomb, which would not have been possible without the [understanding](https://en.wikipedia.org/wiki/Einstein%E2%80%93Szil%C3%A1rd_letter) of relativity, and the inception of computer-era cryptography, which extensively builds on number theory — the computational aspect of which is the main topic of this chapter. + + diff --git a/content/english/hpc/number-theory/cryptography.md b/content/english/hpc/number-theory/cryptography.md index 87f58124..0b8c6b76 100644 --- a/content/english/hpc/number-theory/cryptography.md +++ b/content/english/hpc/number-theory/cryptography.md @@ -1,6 +1,6 @@ --- title: Cryptography -weight: 6 +weight: 7 draft: true --- @@ -22,15 +22,15 @@ To calculate $d$ and restore the message, the attacker would need to repeat step When doing actual communication, people first exchange their public keys (in any, possibly unsecure way) and then use it to encrypt messages. -This is what web browsers do when establishing connection "https". You can also do it by hand with GPG. +This is what web browsers do when establishing connection "https." You can also do it by hand with GPG. ### Man-in-the-middle There is an issue when establishing initial communication that the attacker could replace it and control the communication. -Between your browser and a bank. "Hey this is a message from a bank". +Between your browser and a bank. "Hey this is a message from a bank." -Trust networks. E. g. everyone can trust Google or whoever makes the device or operating system. +Trust networks. E.g., everyone can trust Google or whoever makes the device or operating system. ## Symmetric Cryptography diff --git a/content/english/hpc/number-theory/error-correction.md b/content/english/hpc/number-theory/error-correction.md index 91f1f472..e8774ed8 100644 --- a/content/english/hpc/number-theory/error-correction.md +++ b/content/english/hpc/number-theory/error-correction.md @@ -1,6 +1,6 @@ --- title: Error Correction -weight: 4 +weight: 6 draft: true --- diff --git a/content/english/hpc/number-theory/euclid-extended.md b/content/english/hpc/number-theory/euclid-extended.md new file mode 100644 index 00000000..a37c1b29 --- /dev/null +++ b/content/english/hpc/number-theory/euclid-extended.md @@ -0,0 +1,100 @@ +--- +title: Extended Euclidean Algorithm +weight: 3 +--- + +[Fermat’s theorem](../modular/#fermats-theorem) allows us to calculate modular multiplicative inverses through [binary exponentiation](../exponentiation/) in $O(\log n)$ operations, but it only works with prime moduli. There is a generalization of it, [Euler's theorem](https://en.wikipedia.org/wiki/Euler%27s_theorem), stating that if $m$ and $a$ are coprime, then + +$$ +a^{\phi(m)} \equiv 1 \pmod m +$$ + +where $\phi(m)$ is [Euler's totient function](https://en.wikipedia.org/wiki/Euler%27s_totient_function) defined as the number of positive integers $x < m$ that are coprime with $m$.
In the special case when $m$ is a prime, all the $m - 1$ nonzero residues are coprime with it and $\phi(m) = m - 1$, yielding Fermat's theorem. + +This lets us calculate the inverse of $a$ as $a^{\phi(m) - 1}$ if we know $\phi(m)$, but in turn, calculating it is not so fast: you usually need to obtain the [factorization](/hpc/algorithms/factorization/) of $m$ to do it. There is a more general method that works by modifying the [Euclidean algorithm](/hpc/algorithms/gcd/). + +### Algorithm + +*Extended Euclidean algorithm*, apart from finding $g = \gcd(a, b)$, also finds integers $x$ and $y$ such that + +$$ +a \cdot x + b \cdot y = g +$$ + +which solves the problem of finding the modular inverse if we substitute $b$ with $m$ and $g$ with $1$: + +$$ +a^{-1} \cdot a + k \cdot m = 1 +$$ + +Note that, if $a$ is not coprime with $m$, there is no solution since no integer combination of $a$ and $m$ can yield anything that is not a multiple of their greatest common divisor. + +The algorithm is also recursive: it calculates the coefficients $x'$ and $y'$ for $\gcd(b, a \bmod b)$ and restores the solution for the original number pair. If we have a solution $(x', y')$ for the pair $(b, a \bmod b)$ + +$$ +b \cdot x' + (a \bmod b) \cdot y' = g +$$ + +then, to get the solution for the initial input, we can rewrite the expression $(a \bmod b)$ as $(a - \lfloor \frac{a}{b} \rfloor \cdot b)$ and substitute it into the aforementioned equation: + +$$ +b \cdot x' + (a - \Big \lfloor \frac{a}{b} \Big \rfloor \cdot b) \cdot y' = g +$$ + +Now we rearrange the terms, grouping by $a$ and $b$, to get + +$$ +a \cdot \underbrace{y'}_x + b \cdot \underbrace{(x' - \Big \lfloor \frac{a}{b} \Big \rfloor \cdot y')}_y = g +$$ + +Comparing it with the initial expression, we infer that we can just use the coefficients of $a$ and $b$ for the initial $x$ and $y$. + +### Implementation + +We implement the algorithm as a recursive function. Since its output is not one but three integers, we pass the coefficients to it by reference: + +```c++ +int gcd(int a, int b, int &x, int &y) { + if (a == 0) { + x = 0; + y = 1; + return b; + } + int x1, y1; + int d = gcd(b % a, a, x1, y1); + x = y1 - (b / a) * x1; + y = x1; + return d; +} +``` + +To calculate the inverse, we simply pass $a$ and $m$ and return the $x$ coefficient the algorithm finds. Since we pass two positive numbers, one of the coefficients will be positive and the other one negative (which one depends on whether the number of iterations is odd or even), so we need to optionally check if $x$ is negative and add $m$ to get a correct residue: + +```c++ +int inverse(int a) { + int x, y; + gcd(a, M, x, y); + if (x < 0) + x += M; + return x; +} +``` + +It works in ~160ns — 10ns faster than inverting numbers with [binary exponentiation](../exponentiation). To optimize it further, we can similarly turn it iterative — which takes 135ns: + +```c++ +int inverse(int a) { + int b = M, x = 1, y = 0; + while (a != 1) { + y -= b / a * x; + b %= a; + swap(a, b); + swap(x, y); + } + return x < 0 ? x + M : x; +} +``` + +Note that, unlike binary exponentiation, the running time depends on the value of $a$. For example, for this particular value of $m$ ($10^9 + 7$), the worst input happens to be 564400443, for which the algorithm performs 37 iterations and takes 250ns. + +**Exercise**. Try to adapt the same technique for the [binary GCD](/hpc/algorithms/gcd/#binary-gcd) (it won't give a performance speedup though unless you are better than me at optimization).
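A quick way to sanity-check either version of `inverse` (a small test of my own, not part of the article) is to confirm that $a \cdot a^{-1} \equiv 1 \pmod m$ for a range of inputs; the iterative variant is reproduced here so the snippet compiles on its own:

```c++
#include <cassert>
#include <utility>

const int M = 1e9 + 7; // the same (prime) modulus as in the article

// the iterative version from above, with std::swap spelled out
int inverse(int a) {
    int b = M, x = 1, y = 0;
    while (a != 1) {
        y -= b / a * x;
        b %= a;
        std::swap(a, b);
        std::swap(x, y);
    }
    return x < 0 ? x + M : x;
}

int main() {
    for (int a = 1; a < 1000; a++)
        assert((long long) inverse(a) * a % M == 1); // a * a^-1 = 1 (mod M)
    assert(inverse(2) == (M + 1) / 2); // the inverse of 2 is (M + 1) / 2 = 500000004
    return 0;
}
```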
diff --git a/content/english/hpc/number-theory/exponentiation.md b/content/english/hpc/number-theory/exponentiation.md new file mode 100644 index 00000000..8806257d --- /dev/null +++ b/content/english/hpc/number-theory/exponentiation.md @@ -0,0 +1,109 @@ +--- +title: Binary Exponentiation +weight: 2 +--- + +In modular arithmetic (and computational algebra in general), you often need to raise a number to the $n$-th power — to do [modular division](../modular/#modular-division), perform [primality tests](../modular/#fermats-theorem), or compute some combinatorial values — ­and you usually want to spend fewer than $\Theta(n)$ operations calculating it. + +*Binary exponentiation*, also known as *exponentiation by squaring*, is a method that allows for computation of the $n$-th power using $O(\log n)$ multiplications, relying on the following observation: + +$$ +\begin{aligned} + a^{2k} &= (a^k)^2 +\\ a^{2k + 1} &= (a^k)^2 \cdot a +\end{aligned} +$$ + +To compute $a^n$, we can recursively compute $a^{\lfloor n / 2 \rfloor}$, square it, and then optionally multiply by $a$ if $n$ is odd, corresponding to the following recurrence: + +$$ +a^n = f(a, n) = \begin{cases} + 1, && n = 0 +\\ f(a, \frac{n}{2})^2, && 2 \mid n +\\ f(a, n - 1) \cdot a, && 2 \nmid n +\end{cases} +$$ + +Since $n$ is at least halved every two recursive transitions, the depth of this recurrence and the total number of multiplications will be at most $O(\log n)$. + +### Recursive Implementation + +As we already have a recurrence, it is natural to implement the algorithm as a case matching recursive function: + +```c++ +const int M = 1e9 + 7; // modulo +typedef unsigned long long u64; + +u64 binpow(u64 a, u64 n) { + if (n == 0) + return 1; + if (n % 2 == 1) + return binpow(a, n - 1) * a % M; + else { + u64 b = binpow(a, n / 2); + return b * b % M; + } +} +``` + +In our benchmark, we use $n = m - 2$ so that we compute the [multiplicative inverse](../modular/#modular-division) of $a$ modulo $m$: + +```c++ +u64 inverse(u64 a) { + return binpow(a, M - 2); +} +``` + +We use $m = 10^9+7$, which is a modulo value commonly used in competitive programming to calculate checksums in combinatorial problems — because it is prime (allowing inverse via binary exponentiation), sufficiently large, not overflowing `int` in addition, not overflowing `long long` in multiplication, and easy to type as `1e9 + 7`. + +Since we use it as compile-time constant in the code, the compiler can optimize the modulo by [replacing it with multiplication](/hpc/arithmetic/division/) (even if it is not a compile-time constant, it is still cheaper to compute the magic constants by hand once and use them for fast reduction). + +The execution path — and consequently the running time — depends on the value of $n$. For this particular $n$, the baseline implementation takes around 330ns per call. As recursion introduces some [overhead](/hpc/architecture/functions/), it makes sense to unroll the implementation into an iterative procedure. + +### Iterative Implementation + +The result of $a^n$ can be represented as the product of $a$ to some powers of two — those that correspond to 1s in the binary representation of $n$. For example, if $n = 42 = 32 + 8 + 2$, then + +$$ +a^{42} = a^{32+8+2} = a^{32} \cdot a^8 \cdot a^2 +$$ + +To calculate this product, we can iterate over the bits of $n$ maintaining two variables: the value of $a^{2^k}$ and the current product after considering $k$ lowest bits of $n$. 
On each step, we multiply the current product by $a^{2^k}$ if the $k$-th bit of $n$ is set, and, in either case, square $a^{2^k}$ to get $a^{2^k \cdot 2} = a^{2^{k+1}}$ that will be used on the next iteration.
+
+```c++
+u64 binpow(u64 a, u64 n) {
+    u64 r = 1;
+
+    while (n) {
+        if (n & 1)
+            r = r * a % M;
+        a = a * a % M;
+        n >>= 1;
+    }
+
+    return r;
+}
+```
+
+The iterative implementation takes about 180ns per call. The heavy calculations are the same; the improvement mainly comes from the reduced dependency chain: `a = a * a % M` needs to finish before the loop can proceed, and it can now execute concurrently with `r = r * a % M`.
+
+The performance also benefits from $n$ being a constant, [making all branches predictable](/hpc/pipelining/branching/) and letting the scheduler know what needs to be executed in advance. The compiler, however, does not take advantage of it and does not unroll the `while(n) n >>= 1` loop. We can rewrite it as a `for` loop that performs a constant 30 iterations:
+
+```c++
+u64 inverse(u64 a) {
+    u64 r = 1;
+
+    #pragma GCC unroll(30)
+    for (int l = 0; l < 30; l++) {
+        if ( (M - 2) >> l & 1 )
+            r = r * a % M;
+        a = a * a % M;
+    }
+
+    return r;
+}
+```
+
+This forces the compiler to generate only the instructions we need, shaving off another 10ns and making the total running time ~170ns.
+
+Note that the performance depends not only on the binary length of $n$, but also on the number of binary 1s. If $n$ is $2^{30}$, it takes around 20ns less as we don't have to perform any off-path multiplications.
diff --git a/content/english/hpc/number-theory/finite.md b/content/english/hpc/number-theory/finite.md
index fbef0015..cae2f2ef 100644
--- a/content/english/hpc/number-theory/finite.md
+++ b/content/english/hpc/number-theory/finite.md
@@ -1,6 +1,6 @@
 ---
 title: Finite Fields
-weight: 3
+weight: 5
 draft: true
 ---
diff --git a/content/english/hpc/number-theory/hashing.md b/content/english/hpc/number-theory/hashing.md
index 0484d173..294573a1 100644
--- a/content/english/hpc/number-theory/hashing.md
+++ b/content/english/hpc/number-theory/hashing.md
@@ -12,7 +12,7 @@ Hash function is any function that is:
 
 * Computed fast — at least in linear time, that is.
 * Has a limited image — say, 64-bit values.
-* "Deterministically-random": if it takes $n$ different values, then the probability of collision of two random hashes is $\frac{1}{n}$ and can't be predicted well without knowing the hash function.
+* "Deterministically-random:" if it takes $n$ different values, then the probability of collision of two random hashes is $\frac{1}{n}$ and can't be predicted well without knowing the hash function.
 
 One good test is that can't create a collision in any better time than by birthday paradox. Square root of the hash space.
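To make the birthday-paradox bound concrete (an illustrative estimate, not part of the original draft): for a hash function with $n$ equally likely values, a collision among $k$ random hashes becomes likely around $k \approx \sqrt{n}$, so for 64-bit hashes that is

$$
k \approx \sqrt{2^{64}} = 2^{32} \approx 4 \cdot 10^9 \text{ hashes}
$$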
diff --git a/content/english/hpc/number-theory/img/clock.gif b/content/english/hpc/number-theory/img/clock.gif new file mode 100644 index 00000000..0d0c6555 Binary files /dev/null and b/content/english/hpc/number-theory/img/clock.gif differ diff --git a/content/english/hpc/number-theory/inverse.md b/content/english/hpc/number-theory/inverse.md deleted file mode 100644 index dbfe1676..00000000 --- a/content/english/hpc/number-theory/inverse.md +++ /dev/null @@ -1,187 +0,0 @@ ---- -title: Modular Inverse -weight: 1 ---- - -```c++ -mint inv() const { - uint t = x; - uint res = 1; - while (t != 1) { - uint z = mod / t; - res = (ull) res * (mod - z) % mod; - t = mod - t * z; - } - return res; -} -``` - -In this section, we are going to discuss some preliminaries before discussing more advanced topics. - -In computers, we use the 1st of January, 1970 as the start of the "Unix era", and all time computations are usually done relative to that timestamp. - -We humans also keep track of time relative to some point in the past, which usually has a political or religious significance. At the moment of writing, approximately 63882260594 seconds have passed since 0 AD. - -But for daily tasks, we do not really need that information. Depending on the situation, the relevant part may be that it is 2 pm right now and it's time to go to dinner, or that it's Thursday and so Subway's sub of the day is an Italian BMT. What we do is instead of using a timestamp we use its remainder, which contains just the information we need. And the beautiful thing about it is that remainders are small and cyclic. Think the hour clock: after 12 there comes 1 again, so the number is always small. - -![](../img/clock.gif) - -It is much easier to deal with 1- or 2-digit numbers than 11-digit ones. If we encode each day of the weak starting with Monday from 0 to 6 inclusive, Thursday is going to get number 3. But what day of the week is it going to be in one year? We need to add 365 to it and then reduce modulo 7. It is convenient that `365 % 7` is 1, so we will know that it's Friday unless it is a leap year (in which case it will be Saturday). - -Modular arithmetic studies the way these sets of remainders behave, and it has fundamental applications in number theory, cryptography and data compression. - - -Consider the following problem: our "week" now consists of $m$ days, and we cycle through it with a steps of $a > 0$. How many distinct days there will be? - -Let's assume that the first day is always Monday. At some point the sequence of day is going to cycle. The days will be representable as $k a \mod m$, so we need to find the first $k$ such as $k a$ is divisible by $m$. In the case of $m=7$, $m$ is prime, so the cycle length will be 7 exactly for any $a$. - -Now, if $m$ is not prime, but it is still coprime with $a$. For $ka$ to be divisible by $m$, $k$ needs to be divisible by $m$. In general, the answer is $\frac{m}{gcd(a, m)}$. For example, if the week is 10 days long, if the starting number is even, then it will cycle through all even numbers, and if the number is 5, then it will only cycle between 0 and 5. Otherwise it will go through all 10 remainders. - -### Fermat's Theorem - -Now, consider what happens if instead of adding a number $a$, we repeatedly multiply by it, that is, write numbers in the form $a^n \mod m$. Since these are all finite numbers there is going to be a cycle, but what will its length be? If $p$ is prime, it turns out, all of them. 
- -**Theorem.** $a^p \equiv a \pmod p$ for all $a$ that are not multiple of $p$. - -**Proof**. Let $P(x_1, x_2, \ldots, x_n) = \frac{k}{\prod (x_i!)}$ be the *multinomial coefficient*, that is, the number of times the element $a_1^{x_1} a_2^{x_2} \ldots a_n^{x_n}$ would appear after the expansion of $(a_1 + a_2 + \ldots + a_n)^k$. Then - -$$ -\begin{aligned} -a^p &= (\underbrace{1+1+\ldots+1+1}_\text{$a$ times})^p & -\\\ &= \sum_{x_1+x_2+\ldots+x_a = p} P(x_1, x_2, \ldots, x_a) & \text{(by defenition)} -\\\ &= \sum_{x_1+x_2+\ldots+x_a = p} \frac{p!}{x_1! x_2! \ldots x_a!} & \text{(which terms will not be divisible by $p$?)} -\\\ &\equiv P(p, 0, \ldots, 0) + \ldots + P(0, 0, \ldots, p) & \text{(everything else will be canceled)} -\\\ &= a -\end{aligned} -$$ - -and then dividing by $a$ gives us the Fermat's theorem. - -Note that this is only true for prime $p$. Euler's theorem handles the case of arbitary $m$, and states that - -$$ -a^{\phi(m)} \equiv 1 \pmod m -$$ - -where $\phi(m)$ is called Euler's totient function and is equal to the number of residues of $m$ that is coprime with it. In particular case of when $m$ is prime, $\phi(p) = p - 1$ and we get Fermat's theorem, which is just a special case. - -### Primality Testing - -These theorems have a lot of applications. One of them is checking whether a number $n$ is prime or not faster than factoring it. You can pick any base $a$ at random and try to raise it to power $a^{p-1}$ modulo $n$ and check if it is $1$. Such base is called *witness*. - -Such probabilistic tests are therefore returning either "no" or "maybe". It may be the case that it just happened to be equal to $1$ but in fact $n$ is composite, in which case you need to repeat the test until you are okay with the false positive probability. Moreover, there exist carmichael numbers, which are composite numbers $n$ that satisfy $a^n \equiv 1 \pmod n$ for all $a$. These numbers are rare, but still [exist](https://oeis.org/A002997). - -Unless the input is provided by an adversary, the mistake probability will be low. This test is adequate for finding large primes: there are roughly $\frac{n}{\ln n}$ primes among the first $n$ numbers, which is another fact that we are not going to prove. These primes are distributed more or less evenly, so one can just pick a random number and check numbers in sequence, and after checking $O(\ln n)$ numbers one will probably be found. - -### Binary Exponentiation - -To perform the Fermat test, we need to raise a number to power $n-1$, preferrably using less than $n-2$ modular multiplications. We can use the fact that multiplication is associative: - -$$ -\begin{aligned} - a^{2k} &= (a^k)^2 -\\ a^{2k + 1} &= (a^k)^2 \cdot a -\end{aligned} -$$ - -We essentially group it like this: - -$$ -a^8 = (aaaa) \cdot (aaaa) = ((aa)(aa))((aa)(aa)) -$$ - -This allows using only $O(\log n)$ operations (or, more specifically, at most $2 \cdot \log_2 n$ modular multiplications). - -```c++ -int binpow(int a, int n) { - int res = 1; - while (n) { - if (n & 1) - res = res * a % mod; - a = a * a % mod; - n >>= 1; - } - return res; -} -``` - -This helps if `n` or `mod` is a constant. - -### Modular Division - -"Normal" operations also apply to residues: +, -, *. But there is an issue with division, because we can't just bluntly divide two numbers: $\frac{8}{2} = 4$, но $\frac{8 \\% 5 = 3}{2 \\% 5 = 2} \neq 4$. - -To perform division, we need to find an element that will behave itself like the reciprocal $\frac{1}{a} = a^{-1}$, and instead of "division" multiply by it. 
This element is called a *modular inverse*. - -If the modulo is a prime number, then the solution is $a^{-1} \equiv a^{p-2}$, which follows directly from Fermat's theorem by dividing the equivalence by $a$: - -$$ -a^p \equiv a \implies a^{p-1} \equiv 1 \implies a^{p-2} \equiv a^{-1} -$$ - -This means that $a^{p-2}$ "behaves" like $a^{-1}$ which is what we need. - -You can calculate $a^{p-2}$ in $O(\log p)$ time using binary exponentiation: - -```c++ -int inv(int x) { - return binpow(x, mod - 2); -} -``` - -If the modulo is not prime, then we can still get by calculating $\phi(m)$ and invoking Euler's theorem. But calculating $\phi(m)$ is as difficult as factoring it, which is not fast. There is a more general method. - -### Extended Euclidean Algorithm - -*Extended Euclidean algorithm* apart from finding $g = \gcd(a, b)$ also finds integers $x$ and $y$ such that - -$$ -a \cdot x + b \cdot y = g -$$ - -which solves the problem of finding modular inverse if we substitute $b$ with $m$ and $g$ with $1$: - -$$ -a^{-1} \cdot a + k \cdot m = 1 -$$ - -Note that if $a$ is not coprime with $m$, then there will be no solution. We can still find *some* element, but it will not work for any dividend. - -The algorithm is also recursive. It makes a recursive call, calculates the coefficients $x'$ and $y'$ for $\gcd(b, a \bmod b)$, and restores the general solution. If we have a solution $(x', y')$ for pair $(b, a \bmod b)$: - -$$ -b \cdot x' + (a \bmod b) \cdot y' = g -$$ - -To get the solution for the initial input, rewrite the expression $(a \bmod b)$ as $(a - \lfloor \frac{a}{b} \rfloor \cdot b)$ and subsitute it into the aforementioned equality: - -$$ -b \cdot x' + (a - \Big \lfloor \frac{a}{b} \Big \rfloor \cdot b) \cdot y' = g -$$ - -Now let's rearrange the terms (grouping by $a$ and $b$) to get - -$$ -a \cdot \underbrace{y'}_x + b \cdot \underbrace{(x' - \Big \lfloor \frac{a}{b} \Big \rfloor \cdot y')}_y = g -$$ - -Comparing it with initial expression, we infer that we can just use coefficients by $a$ and $b$ for the initial $x$ and $y$. - -```c++ -int gcd(int a, int b, int &x, int &y) { - if (a == 0) { - x = 0; - y = 1; - return b; - } - int x1, y1; - int d = gcd(b % a, a, x1, y1); - x = y1 - (b / a) * x1; - y = x1; - return d; -} -``` - -Another application is the exact division modulo $2^k$. - -**Exercise**. Try to adapt the technique for binary GCD. diff --git a/content/english/hpc/number-theory/modular.md b/content/english/hpc/number-theory/modular.md new file mode 100644 index 00000000..3d05e2f9 --- /dev/null +++ b/content/english/hpc/number-theory/modular.md @@ -0,0 +1,140 @@ +--- +title: Modular Arithmetic +weight: 1 +--- + + + +Computers usually store time as the number of seconds that have passed since the 1st of January, 1970 — the start of the "Unix era" — and use these timestamps in all computations that have to do with time. + +We humans also keep track of time relative to some point in the past, which usually has a political or religious significance. For example, at the moment of writing, approximately 63882260594 seconds have passed since 1 AD — [6th century Eastern Roman monks' best estimate](https://en.wikipedia.org/wiki/Anno_Domini) of the day Jesus Christ was born. + +But unlike computers, we do not always need *all* that information. Depending on the task at hand, the relevant part may be that it's 2 pm right now, and it's time to go to dinner; or that it's Thursday, and so Subway's sub of the day is an Italian BMT. 
Instead of the whole timestamp, we use its *remainder* containing just the information we need: it is much easier to deal with 1- or 2-digit numbers than 11-digit ones. + +**Problem.** Today is Thursday. What day of the week will be exactly in a year? + +If we enumerate each day of the week, starting with Monday, from $0$ to $6$ inclusive, Thursday gets number $3$. To find out what day it is going to be in a year from now, we need to add $365$ to it and then reduce modulo $7$. Conveniently, $365 \bmod 7 = 1$, so we know that it will be Friday unless it is a leap year (in which case it will be Saturday). + +### Residues + +**Definition.** Two integers $a$ and $b$ are said to be *congruent* modulo $m$ if $m$ divides their difference: + +$$ +m \mid (a - b) \; \Longleftrightarrow \; a \equiv b \pmod m +$$ + +For example, the 42nd day of the year is the same weekday as the 161st since $(161 - 42) = 119 = 17 \times 7$. + +Congruence modulo $m$ is an equivalence relation that splits all integers into equivalence classes called *residues*. Each residue class modulo $m$ may be represented by any one of its members — although we commonly use the smallest nonnegative integer of that class (equal to the remainder $x \bmod m$ for all nonnegative $x$). + + + +*Modular arithmetic* studies these sets of residues, which are fundamental for number theory. + +**Problem.** Our "week" now consists of $m$ days, and our year consists of $a$ days (no leap years). How many distinct days of the week there will be among one, two, three and so on whole years from now? + +For simplicity, assume that today is Monday, so that the initial day number $d_0$ is zero, and after each year, it changes to + +$$ +d_{k + 1} = (d_k + a) \bmod m +$$ + +After $k$ years, it will be + +$$ +d_k = k \cdot a \bmod m +$$ + +Since there are only $m$ days in a week, at some point, it will be Monday again, and the sequence of day numbers is going to cycle. The number of distinct days is the length of this cycle, so we need to find the smallest $k$ such that + +$$ +k \cdot a \equiv 0 \pmod m +$$ + +First of all, if $a \equiv 0$, it will be eternal Monday. Now, assuming the non-trivial case of $a \not \equiv 0$: + +- For a seven-day week, $m = 7$ is prime. There is no $k$ smaller than $m$ such that $k \cdot a$ is divisible by $m$ because $m$ can not be decomposed in such a product by the definition of primality. So, if $m$ is prime, we will cycle through all of $m$ weekdays. +- If $m$ is not prime, but $a$ is *coprime* with it (that is, $a$ and $m$ do not have common divisors), then the answer is still $m$ for the same reason: the divisors of $a$ do not help in zeroing out the product any faster. +- If $a$ and $m$ share some divisors, then it is only possible to get residues that are also divisible by them. For example, if the week is $m = 10$ days long, and the year has $a = 42$ or any other even number of days, then we will cycle through all even day numbers, and if the number of days is a multiple of $5$, then we will only oscillate between $0$ and $5$. Otherwise, we will go through all the $10$ remainders. + +Therefore, in general, the answer is $\frac{m}{\gcd(a, m)}$, where $\gcd(a, m)$ is the [greatest common divisor](/hpc/algorithms/gcd/) of $a$ and $m$. + +### Fermat's Theorem + +Now, consider what happens if, instead of adding a number $a$, we repeatedly multiply by it, writing out a sequence of + +$$ +d_n = a^n \bmod m +$$ + +Again, since there is a finite number of residues, there is going to be a cycle. But what will its length be? 
Turns out, if $m$ is prime, it will span all $(m - 1)$ non-zero residues. + +**Theorem.** For any $a$ and a prime $p$: + +$$ +a^p \equiv a \pmod p +$$ + +**Proof**. Let $P(x_1, x_2, \ldots, x_n) = \frac{k}{\prod (x_i!)}$ be the *multinomial coefficient:* the number of times the element $a_1^{x_1} a_2^{x_2} \ldots a_n^{x_n}$ appears after the expansion of $(a_1 + a_2 + \ldots + a_n)^k$. Then: + +$$ +\begin{aligned} +a^p &= (\underbrace{1+1+\ldots+1+1}_\text{$a$ times})^p & +\\\ &= \sum_{x_1+x_2+\ldots+x_a = p} P(x_1, x_2, \ldots, x_a) & \text{(by definition)} +\\\ &= \sum_{x_1+x_2+\ldots+x_a = p} \frac{p!}{x_1! x_2! \ldots x_a!} & \text{(which terms will not be divisible by $p$?)} +\\\ &\equiv P(p, 0, \ldots, 0) + \ldots + P(0, 0, \ldots, p) & \text{(everything else will be canceled)} +\\\ &= a +\end{aligned} +$$ + +Note that this is only true for prime $p$. We can use this fact to test whether a given number is prime faster than by factoring it: we can pick a number $a$ at random, calculate $a^{p} \bmod p$, and check whether it is equal to $a$ or not. + +This is called *Fermat primality test*, and it is probabilistic — only returning either "no" or "maybe" — since it may be that $a^p$ just happened to be equal to $a$ despite $p$ being composite, in which case you need to repeat the test with a different random $a$ until you are satisfied with the false positive probability. + +Primality tests are commonly used to generate large primes (for cryptographic purposes). There are roughly $\frac{n}{\ln n}$ primes among the first $n$ numbers (a fact that we are not going to prove), and they are distributed more or less evenly. One can just pick a random number from the required range, perform a primality check, and repeat until a prime is found, performing $O(\ln n)$ trials on average. + +An extremely bad input to the Fermat test is the [Carmichael numbers](https://en.wikipedia.org/wiki/Carmichael_number), which are composite numbers $n$ that satisfy $a^{n-1} \equiv 1 \pmod n$ for all relatively prime $a$. But these are [rare](https://oeis.org/A002997), and the chance of randomly bumping into it is low. + +### Modular Division + +Implementing most "normal" arithmetic operations with residues is straightforward. You only need to take care of integer overflows and remember to take modulo: + +```c++ +c = (a + b) % m; +c = (a - b + m) % m; +c = a * b % m; +``` + +But there is an issue with division: we can't just bluntly divide two residues. For example, $\frac{8}{2} = 4$, but + +$$ +\frac{8 \bmod 5}{2 \bmod 5} = \frac{3}{2} \neq 4 +$$ + +To perform modular division, we need to find an element that "acts" like the reciprocal $\frac{1}{a} = a^{-1}$ and multiply by it. This element is called a *modular multiplicative inverse*, and Fermat's theorem can help us find it when the modulo $p$ is a prime. When we divide its equivalence twice by $a$, we get: + +$$ +a^p \equiv a \implies a^{p-1} \equiv 1 \implies a^{p-2} \equiv a^{-1} +$$ + +Therefore, $a^{p-2}$ is like $a^{-1}$ for the purposes of multiplication, which is what we need from a modular inverse of $a$. 
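For illustration, here is a minimal sketch (not part of the original text) of how modular division can then be carried out; it reuses the `binpow` routine and the modulo constant `M` from the [binary exponentiation](../exponentiation) article:

```c++
// Illustrative sketch (not from the original): divide b by a modulo the prime M
// by multiplying with the modular inverse a^(M-2); binpow(a, n) is assumed to
// compute a^n mod M, as defined in the binary exponentiation article.
u64 divide(u64 b, u64 a) {
    return b * binpow(a, M - 2) % M;
}
```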
diff --git a/content/english/hpc/number-theory/montgomery.md b/content/english/hpc/number-theory/montgomery.md index e784dfaf..0eeef0b0 100644 --- a/content/english/hpc/number-theory/montgomery.md +++ b/content/english/hpc/number-theory/montgomery.md @@ -1,102 +1,208 @@ --- title: Montgomery Multiplication -weight: 2 +weight: 4 +published: true --- -When we talked about [integers](../integer) in general, we discussed how to perform division and modulo by multiplication, and, unsurprisingly, in modular arithmetic 90% of its time is spent calculating modulo. Apart from using the general tricks described in the previous article, there is another method specifically for modular arithmetic, called *Montgomery multiplication*. +Unsurprisingly, a large fraction of computation in [modular arithmetic](../modular) is often spent on calculating the modulo operation, which is as slow as [general integer division](/hpc/arithmetic/division/) and typically takes 15-20 cycles, depending on the operand size. -As all other fast reduction methods, it doesn't come for free. It works only in *Montgomery space*, so we need to transform our numbers in and out of it before doing the multiplications. This means that on top of doing some compile-time computations, we would also need to do some operations before the multiplication. +The best way to deal this nuisance is to avoid modulo operation altogether, delaying or replacing it with [predication](/hpc/pipelining/branchless), which can be done, for example, when calculating modular sums: -For the space we need a positive integer $r \ge n$ coprime to $n$. In practice we always choose $r$ to be $2^m$ (with $m$ usually being equal 32 or 64), since multiplications, divisions and modulo $r$ operations can then be efficiently implemented using shifts and bitwise operations. Therefore $n$ needs to be an odd number so that every power of $2$ will be coprime to $n$. And if it is not, we can make it odd (?). +```cpp +const int M = 1e9 + 7; -The representative $\bar x$ of a number $x$ in the Montgomery space is defined as +// input: array of n integers in the [0, M) range +// output: sum modulo M +int slow_sum(int *a, int n) { + int s = 0; + for (int i = 0; i < n; i++) + s = (s + a[i]) % M; + return s; +} + +int fast_sum(int *a, int n) { + int s = 0; + for (int i = 0; i < n; i++) { + s += a[i]; // s < 2 * M + s = (s >= M ? s - M : s); // will be replaced with cmov + } + return s; +} + +int faster_sum(int *a, int n) { + long long s = 0; // 64-bit integer to handle overflow + for (int i = 0; i < n; i++) + s += a[i]; // will be vectorized + return s % M; +} +``` + +However, sometimes you only have a chain of modular multiplications, and there is no good way to eel out of computing the remainder of the division — other than with the [integer division tricks](../hpc/arithmetic/division/) requiring a constant modulo and some precomputation. + +But there is another technique designed specifically for modular arithmetic, called *Montgomery multiplication*. + +### Montgomery Space + +Montgomery multiplication works by first transforming the multipliers into *Montgomery space*, where modular multiplication can be performed cheaply, and then transforming them back when their actual values are needed. Unlike general integer division methods, Montgomery multiplication is not efficient for performing just one modular reduction and only becomes worthwhile when there is a chain of modular operations. + +The space is defined by the modulo $n$ and a positive integer $r \ge n$ coprime to $n$. 
The algorithm involves modulo and division by $r$, so in practice, $r$ is chosen to be $2^{32}$ or $2^{64}$, so that these operations can be done with a right-shift and a bitwise AND respectively. + + + +**Definition.** The *representative* $\bar x$ of a number $x$ in the Montgomery space is defined as $$ \bar{x} = x \cdot r \bmod n $$ -Note that the transformation is actually such a multiplication that we want to optimize, so it is still an expensive operation. However, we will only need to transform a number into the space once, perform as many operations as we want efficiently in that space and at the end transform the final result back, which should be profitable if we are doing lots of operations modulo $n$. +Computing this transformation involves a multiplication and a modulo — an expensive operation that we wanted to optimize away in the first place — which is why we only use this method when the overhead of transforming numbers to and from the Montgomery space is worth it and not for general modular multiplication. + + + +Inside the Montgomery space, addition, substraction, and checking for equality is performed as usual: + +$$ +x \cdot r + y \cdot r \equiv (x + y) \cdot r \bmod n +$$ -Inside the Montgomery space addition, substraction and checking for equality is performed as usual ($x \cdot r + y \cdot r \equiv (x + y) \cdot r \bmod n$). However, this is not the case for multiplication. Denoting multiplication in Montgomery space as $*$ and normal multiplication as $\cdot$, we expect the result to be: +However, this is not the case for multiplication. Denoting multiplication in the Montgomery space as $*$ and the "normal" multiplication as $\cdot$, we expect the result to be: $$ \bar{x} * \bar{y} = \overline{x \cdot y} = (x \cdot y) \cdot r \bmod n $$ -But the normal multiplication will give us: +But the normal multiplication in the Montgomery space yields: $$ \bar{x} \cdot \bar{y} = (x \cdot y) \cdot r \cdot r \bmod n $$ -Therefore the multiplication in the Montgomery space is defined as +Therefore, the multiplication in the Montgomery space is defined as $$ \bar{x} * \bar{y} = \bar{x} \cdot \bar{y} \cdot r^{-1} \bmod n $$ -This means that whenever we multiply two numbers, after the multiplication we need to *reduce* them. Therefore, we need to have an efficient way of calculating $x \cdot r^{-1} \bmod n$. +This means that, after we normally multiply two numbers in the Montgomery space, we need to *reduce* the result by multiplying it by $r^{-1}$ and taking the modulo — and there is an efficent way to do this particular operation. ### Montgomery reduction -Assume that $r=2^{64}$, the modulo $n$ is 64-bit and the number $x$ we need to reduce (multiply by $r^{-1}$) is 128-bit (the product of two 64-bit numbers). +Assume that $r=2^{32}$, the modulo $n$ is 32-bit, and the number $x$ we need to reduce is 64-bit (the product of two 32-bit numbers). Our goal is to calculate $y = x \cdot r^{-1} \bmod n$. -Because $\gcd(n, r) = 1$, we know that there are two numbers $r^{-1}$ and $n'$ in the $[0, n)$ range such that +Since $r$ is coprime with $n$, we know that there are two numbers $r^{-1}$ and $n^\prime$ in the $[0, n)$ range such that $$ -r \cdot r^{-1} + n \cdot n' = 1 +r \cdot r^{-1} + n \cdot n^\prime = 1 $$ -and both $r^{-1}$ and $n'$ can be computed using the extended Euclidean algorithm. +and both $r^{-1}$ and $n^\prime$ can be computed, e.g., using the [extended Euclidean algorithm](../euclid-extended). 
-Using this identity we can express $r \cdot r^{-1}$ as $(-n \cdot n' + 1)$ and write $x \cdot r^{-1}$ as
+Using this identity, we can express $r \cdot r^{-1}$ as $(1 - n \cdot n^\prime)$ and write $x \cdot r^{-1}$ as
 
 $$
 \begin{aligned}
 x \cdot r^{-1} &= x \cdot r \cdot r^{-1} / r
-\\ &= x \cdot (-n \cdot n^{\prime} + 1) / r
-\\ &= (-x \cdot n \cdot n^{\prime} + x) / r
-\\ &\equiv (-x \cdot n \cdot n^{\prime} + l \cdot r \cdot n + x) / r \bmod n
-\\ &\equiv ((-x \cdot n^{\prime} + l \cdot r) \cdot n + x) / r \bmod n
+\\ &= x \cdot (1 - n \cdot n^{\prime}) / r
+\\ &= (x - x \cdot n \cdot n^{\prime} ) / r
+\\ &\equiv (x - x \cdot n \cdot n^{\prime} + k \cdot r \cdot n) / r &\pmod n &\;\;\text{(for any integer $k$)}
+\\ &\equiv (x - (x \cdot n^{\prime} - k \cdot r) \cdot n) / r &\pmod n
 \end{aligned}
 $$
 
-The equivalences hold for any integer $l$. This means that we can add or subtract an arbitrary multiple of $r$ to $x \cdot n'$, or in other words, we can compute $q = x \cdot n'$ modulo $r$.
+Now, if we choose $k$ to be $\lfloor x \cdot n^\prime / r \rfloor$ (the upper 64 bits of the $x \cdot n^\prime$ product), it will cancel out, and $(x \cdot n^{\prime} - k \cdot r)$ will simply be equal to $x \cdot n^{\prime} \bmod r$ (the lower 32 bits of $x \cdot n^\prime$), implying:
+
+$$
+x \cdot r^{-1} \equiv (x - x \cdot n^{\prime} \bmod r \cdot n) / r
+$$
+
+The algorithm itself just evaluates this formula, performing two multiplications to calculate $q = x \cdot n^{\prime} \bmod r$ and $m = q \cdot n$, and then subtracts it from $x$ and right-shifts the result to divide it by $r$.
+
+The only remaining thing to handle is that the result may not be in the $[0, n)$ range; but since
+
+$$
+x < n \cdot n < r \cdot n \implies x / r < n
+$$
+
+and
+
+$$
+m = q \cdot n < r \cdot n \implies m / r < n
+$$
+
+it is guaranteed that
+
+$$
+-n < (x - m) / r < n
+$$
+
+Therefore, we can simply check if the result is negative and in that case, add $n$ to it, giving the following algorithm:
 
-This gives us the following algorithm to compute $x \cdot r^{-1} \bmod n$:
-
-```python
-def reduce(x):
-    q = (x % r) * nr % r
-    a = (x - q * n) / r
-    if a < 0:
-        a += n
-    return a
+```c++
+typedef __uint32_t u32;
+typedef __uint64_t u64;
+
+const u32 n = 1e9 + 7, nr = inverse(n, 1ull << 32);
+
+u32 reduce(u64 x) {
+    u32 q = u32(x) * nr;      // q = x * n' mod r
+    u64 m = (u64) q * n;      // m = q * n
+    u32 y = (x - m) >> 32;    // y = (x - m) / r
+    return x < m ? y + n : y; // if y < 0, add n to make it be in the [0, n) range
+}
 ```
 
-Since $x < n \cdot n < r \cdot n$ (as $x$ is a product of multiplicatio) and $q \cdot n < r \cdot n$, we know that $-n < (x - q \cdot n) / r < n$. Therefore the final modulo operation can be implemented using a single bound check and addition.
+This last check is relatively cheap, but it is still on the critical path. If we are fine with the result being in the $[0, 2 \cdot n - 2]$ range instead of $[0, n)$, we can remove it and add $n$ to the result unconditionally:
+
+```c++
+u32 reduce(u64 x) {
+    u32 q = u32(x) * nr;
+    u64 m = (u64) q * n;
+    u32 y = (x - m) >> 32;
+    return y + n;
+}
+```
+
+We can also move the `>> 32` operation one step earlier in the computation graph and compute $\lfloor x / r \rfloor - \lfloor m / r \rfloor$ instead of $(x - m) / r$. This is correct because the lower 32 bits of $x$ and $m$ are equal anyway since
+
+$$
+m = x \cdot n^\prime \cdot n \equiv x \pmod r
+$$
+
+But why would we voluntarily choose to perform two right-shifts instead of just one?
This is beneficial because for `((u64) q * n) >> 32` we need to do a 32-by-32 multiplication and take the upper 32 bits of the result (which the x86 `mul` instruction [already writes](../hpc/arithmetic/integer/#128-bit-integers) in a separate register, so it doesn't cost anything), and the other right-shift `x >> 32` is not on the critical path. + +```c++ +u32 reduce(u64 x) { + u32 q = u32(x) * nr; + u32 m = ((u64) q * n) >> 32; + return (x >> 32) + n - m; +} +``` -Here is an equivalent C implementation for 64-bit integers: +One of the main advantages of Montgomery multiplication over other modular reduction methods is that it doesn't require very large data types: it only needs a $r \times r$ multiplication that extracts the lower and higher $r$ bits of the result, which [has special support](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7395,7392,7269,4868,7269,7269,1820,1835,6385,5051,4909,4918,5051,7269,6423,7410,150,2138,1829,1944,3009,1029,7077,519,5183,4462,4490,1944,5055,5012,5055&techs=AVX,AVX2&text=mul) on most hardware also makes it easily generalizable to [SIMD](../hpc/simd/) and larger data types: ```c++ -u64 reduce(u128 x) { +typedef __uint128_t u128; + +u64 reduce(u128 x) const { u64 q = u64(x) * nr; u64 m = ((u128) q * n) >> 64; - u64 xhi = (x >> 64); - if (xhi >= m) - return (xhi - m); - else - return (xhi - m) + n; + return (x >> 64) + n - m; } ``` -We also need to implement calculating calculating the inverse of $n$ (`nr`) and transformation of numbers in and our of Montgomery space. Before providing complete implementation, let's discuss how to do that smarter, although they are just done once. +Note that a 128-by-64 modulo is not possible with general integer division tricks: the compiler [falls back](https://godbolt.org/z/fbEE4v4qr) to calling a slow [long arithmetic library function](https://github.com/llvm-mirror/compiler-rt/blob/69445f095c22aac2388f939bedebf224a6efcdaf/lib/builtins/udivmodti4.c#L22) to support it. + +### Faster Inverse and Transform -To transfer a number back from the Montgomery space we can just use Montgomery reduction. +Montgomery multiplication itself is fast, but it requires some precomputation: -### Fast inverse +- inverting $n$ modulo $r$ to compute $n^\prime$, +- transforming a number *to* the Montgomery space, +- transforming a number *from* the Montgomery space. -For computing the inverse $n' = n^{-1} \bmod r$ more efficiently, we can use the following trick inspired from the Newton's method: +The last operation is already efficiently performed with the `reduce` procedure we just implemented, but the first two can be slightly optimized. + +**Computing the inverse** $n^\prime = n^{-1} \bmod r$ can be done faster than with the extended Euclidean algorithm by taking advantage of the fact that $r$ is a power of two and using the following identity: $$ a \cdot x \equiv 1 \bmod 2^k @@ -106,7 +212,7 @@ a \cdot x \cdot (2 - a \cdot x) 1 \bmod 2^{2k} $$ -This can be proven this way: +Proof: $$ \begin{aligned} @@ -119,47 +225,69 @@ a \cdot x \cdot (2 - a \cdot x) \end{aligned} $$ -This means we can start with $x = 1$ as the inverse of $a$ modulo $2^1$, apply the trick a few times and in each iteration we double the number of correct bits of $x$. - -### Fast transformation +We can start with $x = 1$ as the inverse of $a$ modulo $2^1$ and apply this identity exactly $\log_2 r$ times, each time doubling the number of bits in the inverse — somewhat reminiscent of [the Newton's method](../hpc/arithmetic/newton/). 
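As an illustration, here is a minimal standalone sketch of this doubling loop for $r = 2^{32}$ (the function name is ours, and an odd $n$ is assumed); the same loop appears in the constructor of the complete implementation below:

```c++
// Illustrative sketch: compute n^(-1) modulo 2^32 for an odd n.
u32 inverse_mod_r(u32 n) {
    u32 x = 1;                  // x is the inverse of n modulo 2 (n is odd)
    for (int i = 0; i < 5; i++) // log2(32) = 5 doublings: 2 -> 4 -> 8 -> 16 -> 32 bits
        x *= 2 - n * x;         // unsigned arithmetic wraps around modulo 2^32
    return x;                   // now n * x = 1 (mod 2^32)
}
```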
-Although we can just multiply a number by $r$ and compute one modulo the usual way, there is a faster way that makes use of the following relation:
+**Transforming** a number into the Montgomery space can be done by multiplying it by $r$ and computing modulo [the usual way](/hpc/arithmetic/division/), but we can also take advantage of this relation:
 
 $$
 \bar{x} = x \cdot r \bmod n = x * r^2
 $$
 
-Transforming a number into the space is just a multiplication inside the space of the number with $r^2$. Therefore we can precompute $r^2 \bmod n$ and just perform a multiplication and reduction instead.
+Transforming a number into the space is just a multiplication by $r^2$. Therefore, we can precompute $r^2 \bmod n$ and perform a multiplication and reduction instead — which may or may not be actually faster because multiplying a number by $r=2^{k}$ can be implemented with a left-shift, while multiplication by $r^2 \bmod n$ can not.
 
 ### Complete Implementation
 
+It is convenient to wrap everything into a single `constexpr` structure:
+
 ```c++
-// TODO fix me and prettify me
-struct montgomery {
-    u64 n, nr;
+struct Montgomery {
+    u32 n, nr;
 
-    montgomery(u64 n) : n(n) {
-        nr = 1;
-        for (int i = 0; i < 6; i++)
+    constexpr Montgomery(u32 n) : n(n), nr(1) {
+        // log2(32) = 5 iterations
+        for (int i = 0; i < 5; i++)
             nr *= 2 - n * nr;
     }
 
-    u64 reduce(u128 x) {
-        u64 q = u64(x) * nr;
-        u64 m = ((u128) q * n) >> 64;
-        u64 xhi = (x >> 64);
-        if (xhi >= m)
-            return (xhi - m);
-        else
-            return (xhi - m) + n;
+    u32 reduce(u64 x) const {
+        u32 q = u32(x) * nr;
+        u32 m = ((u64) q * n) >> 32;
+        return (x >> 32) + n - m;
+        // returns a number in the [0, 2 * n - 2] range
+        // (add a "x < n ? x : x - n" type of check if you need a proper modulo)
     }
 
-    u64 mult(u64 x, u64 y) {
-        return reduce((u128) x * y);
+    u32 multiply(u32 x, u32 y) const {
+        return reduce((u64) x * y);
     }
 
-    u64 transform(u64 x) {
-        return (u128(x) << 64) % n;
+    u32 transform(u32 x) const {
+        return (u64(x) << 32) % n;
+        // can also be implemented as multiply(x, r^2 mod n)
     }
 };
 ```
+
+To test its performance, we can plug Montgomery multiplication into the [binary exponentiation](../exponentiation/):
+
+```c++
+constexpr Montgomery space(M);
+
+int inverse(int _a) {
+    u64 a = space.transform(_a);
+    u64 r = space.transform(1);
+
+    #pragma GCC unroll(30)
+    for (int l = 0; l < 30; l++) {
+        if ( (M - 2) >> l & 1 )
+            r = space.multiply(r, a);
+        a = space.multiply(a, a);
+    }
+
+    return space.reduce(r);
+}
+```
+
+While vanilla binary exponentiation with a compiler-generated fast modulo trick requires ~170ns per `inverse` call, this implementation takes ~166ns, going down to ~158ns if we omit `transform` and `reduce` (a reasonable use case is for `inverse` to be used as a subprocedure in a bigger modular computation). This is a small improvement, but Montgomery multiplication becomes much more advantageous for SIMD applications and larger data types.
+
+**Exercise.** Implement efficient *modular* [matrix multiplication](/hpc/algorithms/matmul).
diff --git a/content/english/hpc/parallel/concurrency/fibers.md b/content/english/hpc/parallel/concurrency/fibers.md
index 2ec2806c..cce7b860 100644
--- a/content/english/hpc/parallel/concurrency/fibers.md
+++ b/content/english/hpc/parallel/concurrency/fibers.md
@@ -28,4 +28,4 @@ func main() {
 
 The way they work is that the language maintains a group of threads ready to pick up from where they left. This is called N:M scheduling.
 
-Similar runtimes exist for other languages, e. g. for C++ and Rust.
+Similar runtimes exist for other languages, e.g., for C++ and Rust. diff --git a/content/english/hpc/parallel/gpu/_index.en.md b/content/english/hpc/parallel/gpu/_index.en.md index aafb7ba1..ac2a4aa9 100644 --- a/content/english/hpc/parallel/gpu/_index.en.md +++ b/content/english/hpc/parallel/gpu/_index.en.md @@ -73,7 +73,7 @@ CUDA is available for many languages. Nice documentation can be found here: https://documen.tician.de/pycuda/index.html -If you are on Colab, go to Runtime -> Change runtime type -> Hardware accelerator and set it to "GPU". +If you are on Colab, go to Runtime -> Change runtime type -> Hardware accelerator and set it to "GPU." ```python @@ -167,7 +167,7 @@ There is also `drv.InOut` function, which makes it available for both reading an Most of the operations here are memory operations, so measuring performance here is useless. Don't worry, we will get to more complex examples soon enough. -GPUs have very specific operations. However, in case of NVIDIA GPUs managing it is quite simple: the cards have *compute capabilities* (1.0, 1.1, 1.2, 1.3, 2.0, etc.) and all features added at capability $x$ is also available at later versions. These can be checked at run-time or compile-time. +GPUs have very specific operations. However, in case of NVIDIA GPUs managing it is quite simple: the cards have *compute capabilities* (1.0, 1.1, 1.2, 1.3, 2.0, etc.) and all features added at capability $x$ is also available at later versions. These can be checked at run time or compile time. You can check differences in this Wikipedia article: https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications @@ -195,7 +195,7 @@ Some tasks, especially in cryptography, cannot be parallelized. But some can. ## Summing arrays in $O(\log n)$ time -Assume we want to perform some associative (i. e. $A*(B*C) = (A*B)*C$) operation on an array of $n$ elements. Say, sum it up. +Assume we want to perform some associative (i.e., $A*(B*C) = (A*B)*C$) operation on an array of $n$ elements. Say, sum it up. Normally, we would do that with a simple loop: @@ -418,7 +418,7 @@ Intrinsics for that. Now, a lot of value comes from cryptocurrency and deep learning. The latter relies on two specific operations: matrix multiplications for linear layers and convolutions for convolutional layers used in computer vision. -First, they introduced "multiply-accumulate" operation (e. g. `x += y * z`) per 1 GPU clock cycle. +First, they introduced "multiply-accumulate" operation (e.g., `x += y * z`) per 1 GPU clock cycle. Google uses Tensor Processing Units. Nobody really knows how they work (proprietary hardware that they rent, not sell). @@ -431,7 +431,7 @@ Well, you don't really need anything more precise than that for deep learning an It is called mixed precision because input matrices are fp16 but multiplication result and accumulator are fp32 matrices. -Probably, the proper name would be "4x4 matrix cores", however NVIDIA marketing team decided to use "tensor cores". +Probably, the proper name would be "4x4 matrix cores," however NVIDIA marketing team decided to use "tensor cores." So, see, this is not exactly fair comparison. 
diff --git a/content/english/hpc/pipelining/_index.md b/content/english/hpc/pipelining/_index.md index 9dc491d0..aab72d79 100644 --- a/content/english/hpc/pipelining/_index.md +++ b/content/english/hpc/pipelining/_index.md @@ -5,7 +5,7 @@ weight: 3 When programmers hear the word *parallelism*, they mostly think about *multi-core parallelism*, the practice of explicitly splitting a computation into semi-independent *threads* that work together to solve a common problem. -This type of parallelism is mainly about reducing *latency* and achieving *scalability*, but not about improving *efficiency*. You can solve a problem ten times as big with a parallel algorithm, but it would take at least ten times as many computational resources. Although parallel hardware is becoming [ever more abundant](/hpc/complexity/hardware), and parallel algorithm design is becoming an increasingly more important area, for now, we will consider the use of more than one CPU core cheating. +This type of parallelism is mainly about reducing *latency* and achieving *scalability*, but not about improving *efficiency*. You can solve a problem ten times as big with a parallel algorithm, but it would take at least ten times as many computational resources. Although parallel hardware is becoming [ever more abundant](/hpc/complexity/hardware) and parallel algorithm design is becoming an increasingly important area, for now, we will limit ourselves to considering only a single CPU core. But there are other types of parallelism, already existing inside a CPU core, that you can use *for free*. @@ -19,9 +19,9 @@ Parallelism helps in reducing *latency*. It is important, but for now, our main Sharing computations is an art in itself, but for now, we want to learn how to use resources that we already have more efficiently. -While multi-core parallelism is "cheating", many form of parallelism exist "for free". +While multi-core parallelism is "cheating," many form of parallelism exist "for free." -Adapting algorithms for parallel hardware is important for achieving *scalability*. In the first part of this book, we will consider this technique "cheating". We only do optimizations that are truly free, and preferably don't take away resources from other processes that might be running concurrently. +Adapting algorithms for parallel hardware is important for achieving *scalability*. In the first part of this book, we will consider this technique "cheating." We only do optimizations that are truly free, and preferably don't take away resources from other processes that might be running concurrently. --> @@ -42,16 +42,16 @@ Pipelining does not reduce *actual* latency but functionally makes it seem like Having this in mind, hardware manufacturers prefer to use *cycles per instruction* (CPI) instead of something like "average instruction latency" as the main performance indicator for CPU designs. It is a [pretty good metric](/hpc/profiling/benchmarking) for algorithm designs too, if we only consider *useful* instructions. -CPI of a perfectly pipelined processor should tend to one, but it can actually be even lower if we make each stage of the pipeline "wider" by duplicating it, so that more than one instruction can be processed at a time. Because the cache and most of the ALU can be shared, this ends up being cheaper than adding a fully separate core. Such architectures, capable of executing more than one instruction per cycle, are called *superscalar*, and most modern CPUs are. 
+The CPI of a perfectly pipelined processor should tend to one, but it can actually be even lower if we make each stage of the pipeline "wider" by duplicating it, so that more than one instruction can be processed at a time. Because the cache and most of the ALU can be shared, this ends up being cheaper than adding a fully separate core. Such architectures, capable of executing more than one instruction per cycle, are called *superscalar*, and most modern CPUs are. -You can only take advantage of superscalar processing if the stream of instructions contains groups of logically independent operations that can be processed separately. The instructions don't always arrive in the most convenient order, so, when possible, modern CPUs can execute them *out-of-order* to improve overall utilization and minimize pipeline stalls. How this magic works is a topic for [a more advanced discussion](scheduling), but for now, you can assume that the CPU maintains a buffer of pending instructions up to some distance in the future, and executes them as soon as the values of its operands are computed and there is an execution unit available. +You can only take advantage of superscalar processing if the stream of instructions contains groups of logically independent operations that can be processed separately. The instructions don't always arrive in the most convenient order, so, when possible, modern CPUs can execute them *out of order* to improve overall utilization and minimize pipeline stalls. How this magic works is a topic for a more advanced discussion, but for now, you can assume that the CPU maintains a buffer of pending instructions up to some distance in the future, and executes them as soon as the values of its operands are computed and there is an execution unit available. ### An Education Analogy Consider how our education system works: 1. Topics are taught to groups of students instead of individuals as broadcasting the same things to everyone at once is more efficient. -2. An intake of students is split into groups lead by different teachers; assignments and other course materials are shared between groups. +2. An intake of students is split into groups led by different teachers; assignments and other course materials are shared between groups. 3. Each year the same course is taught to a new intake so that the teachers are kept busy. These innovations greatly increase the *throughput* of the whole system, although the *latency* (time to graduation for a particular student) remains unchanged (and maybe increases a little bit because personalized tutoring is more effective). @@ -62,7 +62,7 @@ You can find many analogies with modern CPUs: 2. There are multiple execution units that can process these instructions simultaneously while sharing other CPU facilities (usually 2-4 execution units). 3. Instructions are processed in pipelined fashion (saving roughly the same number of cycles as the number of years between kindergarten and PhD). 
- + In addition to that, several other aspects also match: diff --git a/content/english/hpc/pipelining/branching.md b/content/english/hpc/pipelining/branching.md index db008023..08d7887d 100644 --- a/content/english/hpc/pipelining/branching.md +++ b/content/english/hpc/pipelining/branching.md @@ -1,9 +1,10 @@ --- title: The Cost of Branching weight: 2 +published: true --- -When a CPU encounters a conditional jump or [any other type of branching](/hpc/architecture/indirect), it doesn't just sit idle until its condition is computed — instead it starts *speculatively executing* the branch that seems more likely to be taken immediately. During execution the CPU computes statistics about branches taken on each instruction, and after a while and they start to predict them by recognizing common patterns. +When a CPU encounters a conditional jump or [any other type of branching](/hpc/architecture/indirect), it doesn't just sit idle until its condition is computed — instead, it starts *speculatively executing* the branch that seems more likely to be taken immediately. During execution, the CPU computes statistics about branches taken on each instruction, and after some time, they start to predict them by recognizing common patterns. For this reason, the true "cost" of a branch largely depends on how well it can be predicted by the CPU. If it is a pure 50/50 coin toss, you have to suffer a [control hazard](../hazards) and discard the entire pipeline, taking another 15-20 cycles to build up again. And if the branch is always or never taken, you pay almost nothing except checking the condition. @@ -26,7 +27,7 @@ for (int i = 0; i < N; i++) s += a[i]; ``` -We set $N = 10^6$ and run this loop many times over so that cold cache effects doesn't mess up our results. We mark our accumulator variable as `volatile` so that the compiler doesn't vectorize the loop, interleave its iterations, or "cheat" in any other way. +We set $N = 10^6$ and run this loop many times over so that the [cold cache](/hpc/cpu-cache/bandwidth) effect doesn't mess up our results. We mark our accumulator variable as `volatile` so that the compiler doesn't vectorize the loop, interleave its iterations, or "cheat" in any other way. On Clang, this produces assembly that looks like this: @@ -44,17 +45,17 @@ body: jmp counter ``` -Our goal is to simulate a completely unpredictable branch, and we successfully achieve it: the code takes ~14 CPU cycles per element. For a very rough estimate of what it is supposed to be, we can assume that the branches alternate between "<" and ">=", and the pipeline is mispredicted every other iteration. Then, every two iterations: +Our goal is to simulate a completely unpredictable branch, and we successfully achieve it: the code takes ~14 CPU cycles per element. For a very rough estimate of what it is supposed to be, we can assume that the branches alternate between `<` and `>=`, and the pipeline is mispredicted every other iteration. Then, every two iterations: -- We discard the pipeline, which is 19 cycles deep on Zen 2 (i. e. it has 19 stages, each taking one cycle). +- We discard the pipeline, which is 19 cycles deep on Zen 2 (i.e., it has 19 stages, each taking one cycle). - We need a memory fetch and a comparison, which costs ~5 cycles. We can check the conditions of even and odd iterations concurrently, so let's assume we only pay it once per 2 iterations. -- In case of the "<" branch, we need another ~4 cycles to add `a[i]` to a volatile (memory-stored) variable `s`. 
+- In the case of the `<` branch, we need another ~4 cycles to add `a[i]` to a volatile (memory-stored) variable `s`. Therefore, on average, we need to spend $(4 + 5 + 19) / 2 = 14$ cycles per element, matching what we measured. ### Branch Prediction -We can replace the hardcoded 50% with a tweakable parameter `P`, which effectively corresponds to the probability of the "<" branch: +We can replace the hardcoded `50` with a tweakable parameter `P` that effectively sets the probability of the `<` branch: ```c++ for (int i = 0; i < N; i++) @@ -66,15 +67,15 @@ Now, if we benchmark it for different values of `P`, we get an interesting-looki ![](../img/probabilities.svg) -It's peak is at 50-55%, as expected: branch misprediction is the most expensive thing here. This graph is asymmetrical: it takes just ~1 cycle to only check conditions that are never satisfied (`P = 0`), and ~7 cycles for the sum if the branch is always taken (`P = 7`). +Its peak is at 50-55%, as expected: branch misprediction is the most expensive thing here. This graph is asymmetrical: it takes just ~1 cycle to only check conditions that are never satisfied (`P = 0`), and ~7 cycles for the sum if the branch is always taken (`P = 100`). -An interesting detail is that this graph is not unimodal: there is another local minimum at around 85-90%. We spend ~6.15 cycles per element there, or about 10-15% faster compared to when we always take the branch, accounting for the fact that we need to perform less additions. Branch misprediction stop affecting performance at this point, because it happens, not the whole instruction buffer is discarded, but only the operations that were speculatively scheduled. That 10-15% mispredict rate is the equilibrium point where we can see far enough in the pipeline not to stall, but save 10-15% on taking the cheaper ">=" branch. +This graph is not unimodal: there is another local minimum at around 85-90%. We spend ~6.15 cycles per element there or about 10-15% faster than when we always take the branch, accounting for the fact that we need to perform fewer additions. Branch misprediction stops affecting the performance at this point because when it happens, not the whole instruction buffer is discarded, but only the operations that were speculatively scheduled. Essentially, that 10-15% mispredict rate is the equilibrium point where we can see far enough in the pipeline not to stall but still save 10-15% on taking the cheaper `>=` branch. Note that it costs almost nothing to check for a condition that never or almost never occurs. This is why programmers use runtime exceptions and base case checks so profusely: if they are indeed rare, they don't really cost anything. ### Pattern Detection -Here, everything that was needed of a branch prediction is a hardware statistics counter: if we went to branch A more often than to branch B, then it makes sense to speculatively execute branch A. But branch predictors on modern CPUs are considerably more advanced than that and can detect much more complicated patterns. +In our example, everything that was needed for efficient branch prediction is a hardware statistics counter. If we historically took branch A more often than branch B, then it makes sense to speculatively execute branch A. But branch predictors on modern CPUs are considerably more advanced than that and can detect much more complicated patterns. 
Let's fix `P` back at 50, and then sort the array first before the main summation loop: @@ -85,9 +86,9 @@ for (int i = 0; i < N; i++) std::sort(a, a + n); ``` -We are still processing the same elements, but in different order, and instead of 14 cycles, it now runs in a little bit more than 4, which is exactly the average of the cost of the pure "<" and ">=" branches. +We are still processing the same elements, but in a different order, and instead of 14 cycles, it now runs in a little bit more than 4, which is exactly the average of the cost of the pure `<` and `>=` branches. -The branch predictor can pick up on much more complicated patterns than just "always left, then always right" or "left-right-left-right". If we just decrease the size of the array $N$ to 1000 (without sorting it), then branch predictor memorizes the entire sequence of comparisons, and the benchmark again measures at around 4 — in fact, even slightly less than in the sorted array, because in the former case branch predictor needs to spend some time flicking between the "always yes" and "always no" states. +The branch predictor can pick up on much more complicated patterns than just "always left, then always right" or "left-right-left-right." If we just decrease the size of the array $N$ to 1000 (without sorting it), then the branch predictor memorizes the entire sequence of comparisons, and the benchmark again measures at around 4 cycles — in fact, even slightly fewer than in the sorted array case, because in the former case branch predictor needs to spend some time flicking between the "always yes" and "always no" states. ### Hinting Likeliness of Branches diff --git a/content/english/hpc/pipelining/branchless.md b/content/english/hpc/pipelining/branchless.md index e356d5a2..31bd5a39 100644 --- a/content/english/hpc/pipelining/branchless.md +++ b/content/english/hpc/pipelining/branchless.md @@ -1,9 +1,10 @@ --- title: Branchless Programming weight: 3 +published: true --- -As we established in [the pervious section](../branching), branches that can't be effectively predicted by the CPU are expensive as they may cause a long pipeline stall to fetch new instructions after a branch mispredict. In this section, we discuss the means of removing branches in the first place. +As we established in [the previous section](../branching), branches that can't be effectively predicted by the CPU are expensive as they may cause a long pipeline stall to fetch new instructions after a branch mispredict. In this section, we discuss the means of removing branches in the first place. ### Predication @@ -27,30 +28,32 @@ for (int i = 0; i < N; i++) s += (a[i] < 50) * a[i]; ``` -Suddenly, the loop now takes ~7 cycles per element, instead of the original ~14. Also, the performance remains constant if we change `50` to some other threshold, so it doesn't depend on the branch probability. +The loop now takes ~7 cycles per element instead of the original ~14. Also, the performance remains constant if we change `50` to some other threshold, so it doesn't depend on the branch probability. But wait… shouldn't there still be a branch? How does `(a[i] < 50)` map to assembly? -There are no boolean types in assembly, nor any instructions that yield either one or zero based on the result of the comparison, but we can compute it indirectly like this: `(a[i] - 50) >> 31`. 
This trick relies on the [binary representation of integers](/hpc/arithmetic/integer), specifically on the fact that if the expression `a[i] - 50` is negative (implying `a[i] < 50`), then the highest bit of the result will be set to one, which we can then extract using a right-shift. +There are no Boolean types in assembly, nor any instructions that yield either one or zero based on the result of the comparison, but we can compute it indirectly like this: `(a[i] - 50) >> 31`. This trick relies on the [binary representation of integers](/hpc/arithmetic/integer), specifically on the fact that if the expression `a[i] - 50` is negative (implying `a[i] < 50`), then the highest bit of the result will be set to one, which we can then extract using a right-shift. ```nasm mov ebx, eax ; t = x sub ebx, 50 ; t -= 50 sar ebx, 31 ; t >>= 31 -mul eax, ebx ; x *= t +imul eax, ebx ; x *= t ``` -Another, more complicated way to implement this whole sequence is to convert this sign byte into a mask and then use bitwise `and` instead of multiplication: `((a[i] - 50) >> 1 - 1) & a`. This makes the whole sequence one cycle faster, considering that unlike other instructions, `mul` takes 3 cycles: +Another, more complicated way to implement this whole sequence is to convert this sign bit into a mask and then use bitwise `and` instead of multiplication: `((a[i] - 50) >> 31 - 1) & a[i]`. This makes the whole sequence one cycle faster, considering that, unlike other instructions, `imul` takes 3 cycles: ```nasm mov ebx, eax ; t = x sub ebx, 50 ; t -= 50 sar ebx, 31 ; t >>= 31 -; mul eax, ebx ; x *= t +; imul eax, ebx ; x *= t sub ebx, 1 ; t -= 1 (causing underflow if t = 0) and eax, ebx ; x &= t ``` +Note that this optimization is not technically correct from the compiler's perspective: for the 50 lowest representable integers — those in the $[-2^{31}, - 2^{31} + 49]$ range — the result will be wrong due to underflow. We know that all numbers are all between 0 and 100, and this won't happen, but the compiler doesn't. + But the compiler actually elects to do something different. Instead of going with this arithmetic trick, it used a special `cmov` ("conditional move") instruction that assigns a value based on a condition (which is computed and checked using the flags register, the same way as for jumps): ```nasm @@ -63,7 +66,7 @@ So the code above is actually closer to using a ternary operator like this: ```c++ for (int i = 0; i < N; i++) - s += (a[i] < 50 : a[i] : 0); + s += (a[i] < 50 ? a[i] : 0); ``` Both variants are optimized by the compiler and produce the following assembly: @@ -86,11 +89,11 @@ $$ x = c \cdot a + (1 - c) \cdot b $$ -This way you can eliminate branching, but this comes at the cost of evaluating *both* branches and the `cmov` itself. Because evaluating the ">=" branch costs nothing, the performance is exactly equal to [the "always yes" case](branching/#branch-prediction) in the branchy version. +This way you can eliminate branching, but this comes at the cost of evaluating *both* branches and the `cmov` itself. Because evaluating the ">=" branch costs nothing, the performance is exactly equal to [the "always yes" case](../branching/#branch-prediction) in the branchy version. -### When It Is Beneficial +### When Predication Is Beneficial -Using predication eliminates [a structural hazard](../hazard), but introduces a data hazard. These is still a pipeline stall, but it is a cheaper one: you only need to wait for `cmov` to be resolved, and not flush the entire pipeline in case of a mispredict. 
+Using predication eliminates [a control hazard](../hazards) but introduces a data hazard. There is still a pipeline stall, but it is a cheaper one: you only need to wait for `cmov` to be resolved and not flush the entire pipeline in case of a mispredict. However, there are many situations when it is more efficient to leave branchy code as it is. This is the case when the cost of computing *both* branches instead of just *one* outweighs the penalty for the potential branch mispredictions. @@ -98,13 +101,13 @@ In our example, the branchy code wins when the branch can be predicted with a pr ![](../img/branchy-vs-branchless.svg) -This 75% threshold is commonly used by the compilers as a heuristic for determining whether to use the `cmov` or not. Unfortunately, this probability is usually unknown at the compile-time, so it needs to provided in one of several ways: +This 75% threshold is commonly used by the compilers as a heuristic for determining whether to use the `cmov` or not. Unfortunately, this probability is usually unknown at the compile time, so it needs to be provided in one of several ways: -- We can use [profile-guided optimization](/hpc/compilation/pgo) which will decide for itself whether to use predication or not. -- We can use [compiler-specific intrinsics](/hpc/compilation/situational) to hint the likeliness of branches: `__builtin_expect_with_probability` in GCC and `__builtin_unpredictable` in Clang. +- We can use [profile-guided optimization](/hpc/compilation/situational/#profile-guided-optimization) which will decide for itself whether to use predication or not. +- We can use [likeliness attributes](../branching#hinting-likeliness-of-branches) and [compiler-specific intrinsics](/hpc/compilation/situational) to hint at the likeliness of branches: `__builtin_expect_with_probability` in GCC and `__builtin_unpredictable` in Clang. - We can rewrite branchy code using the ternary operator or various arithmetic tricks, which acts as sort of an implicit contract between programmers and compilers: if the programmer wrote the code this way, then it was probably meant to be branchless. -The "right way" is to use branching hints, but unfortunately, the support for them is lacking. Right now [these hints seem to be lost](https://bugs.llvm.org/show_bug.cgi?id=40027) by the time the compiler back-end decides whether a `cmov` is more beneficial. Currently, there is no good way of forcing the compiler to generate branch-free code, so sometimes the best hope is to just write a small snippet in assembly. +The "right way" is to use branching hints, but unfortunately, the support for them is lacking. Right now [these hints seem to be lost](https://bugs.llvm.org/show_bug.cgi?id=40027) by the time the compiler back-end decides whether a `cmov` is more beneficial. There is [some progress](https://discourse.llvm.org/t/rfc-cmov-vs-branch-optimization/6040) towards making it possible, but currently, there is no good way of forcing the compiler to generate branch-free code, so sometimes the best hope is to just write a small snippet in assembly. -**Data-parallel programming.** Branchless programming is very important for [SIMD](/hpc/simd) applications, including GPU programming, because they don't have branching in the first place. +**Data-parallel programming.** Branchless programming is very important for [SIMD](/hpc/simd) applications because they don't have branching in the first place. 
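To make the $x = c \cdot a + (1 - c) \cdot b$ identity quoted above concrete, here is a minimal sketch of my own (the function names are mine, and this is an illustration rather than code from the benchmarks in this section):

```c++
// Branch-free select via arithmetic: c is either 0 or 1,
// so exactly one of the two terms survives.
int select_arith(bool cond, int a, int b) {
    int c = cond;               // 0 or 1
    return c * a + (1 - c) * b;
}

// The same selection written with a ternary operator;
// this is the "implicit contract" form that compilers
// tend to turn into a cmov rather than a conditional jump.
int select_ternary(bool cond, int a, int b) {
    return cond ? a : b;
}
```

Neither form is guaranteed to stay branchless after optimization, which is exactly the limitation discussed above: the compiler treats these as hints, not commands.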
-In our array sum example, if you remove the `volatile` type qualifier from the accumulator, the compiler becomes able to [vectorize](/hpc/simd/autovectorization) the loop: +In our array sum example, removing the `volatile` type qualifier from the accumulator allows the compiler to [vectorize](/hpc/simd/auto-vectorization) the loop: ```c++ /* volatile */ int s = 0; @@ -227,7 +230,7 @@ for (int i = 0; i < N; i++) It now works in ~0.3 per element, which is mainly [bottlenecked by the memory](/hpc/cpu-cache/bandwidth). -The compiler is usually able to vectorize any loop that doesn't have branches or dependencies between the iterations — and some specific deviations from that, such as [reductions](/hpc/simd/reduction) or simple loops that contain just one if-without-else. Vectorization of anything more complex is a very nontrivial problem, which may involve various techniques such as [masking](/hpc/simd/masking) and [in-register permutations](/hpc/simd/permutation). +The compiler is usually able to vectorize any loop that doesn't have branches or dependencies between the iterations — and some specific small deviations from that, such as [reductions](/hpc/simd/reduction) or simple loops that contain just one if-without-else. Vectorization of anything more complex is a very nontrivial problem, which may involve various techniques such as [masking](/hpc/simd/masking) and [in-register permutations](/hpc/simd/shuffling). -Interleaving the stages of execution is a general idea in digital electronics, and it is applied not only in the main CPU pipeline, but also on the level of separate instructions and [memory](/hpc/cpu-cache/mlp). Most execution units have their own little pipelines, and can take another instruction just one or two cycles after the previous one. If a certain instruction is frequently used, it makes sense to duplicate its execution unit also, and also place frequently jointly used instructions on the same execution unit: e. g. not using the same for arithmetic and memory operation. +Interleaving the stages of execution is a general idea in digital electronics, and it is applied not only in the main CPU pipeline, but also on the level of separate instructions and [memory](/hpc/cpu-cache/mlp). Most execution units have their own little pipelines, and can take another instruction just one or two cycles after the previous one. If a certain instruction is frequently used, it makes sense to duplicate its execution unit also, and also place frequently jointly used instructions on the same execution unit: e.g., not using the same for arithmetic and memory operation. ### Microcode @@ -22,9 +22,9 @@ While complex instruction sets had the benefit, with superscalar processors you Instructions are microcoded. -uOps ("micro-ops", the first letter is meant to be greek letter mu as in us (microsecond), but nobody cares enough to type it). +uOps ("micro-ops," the first letter is meant to be greek letter mu as in us (microsecond), but nobody cares enough to type it). -Each architecture has its own set of "ports", each capable of executing its own set of instructions (uOps, to be more exact). +Each architecture has its own set of "ports," each capable of executing its own set of instructions (uOps, to be more exact). But still, when you use it, it appears and feels like a single instruction. How does CPU achieve that? 
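Returning for a moment to the "one if-without-else" deviation mentioned above, here is the shape of loop that compilers can usually still vectorize by turning the condition into a mask (a sketch of my own, assuming something like `-O3 -mavx2`; whether it actually happens depends on the compiler and version):

```c++
// A conditional reduction with a single if-without-else:
// the compiler can replace the branch with a masked (blended) add,
// so this loop is typically auto-vectorized.
int sum_below(const int *a, int n, int threshold) {
    int s = 0;
    for (int i = 0; i < n; i++)
        if (a[i] < threshold)
            s += a[i];
    return s;
}
```

Add an `else` branch that writes to a different array, or make `s` `volatile` as in the earlier benchmarks, and the compiler will usually give up and fall back to scalar code.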
diff --git a/content/english/hpc/pipelining/tables.md b/content/english/hpc/pipelining/tables.md index d18d99c6..ad90c400 100644 --- a/content/english/hpc/pipelining/tables.md +++ b/content/english/hpc/pipelining/tables.md @@ -14,7 +14,7 @@ In this context, it makes sense to use two different "[costs](/hpc/complexity)" -You can get latency and throughput numbers for a specific architecture from special documents called [instruction tables](https://www.agner.org/optimize/instruction_tables.pdf). Here are some samples values for my Zen 2 (all specified for 32-bit operands, if there is any difference): +You can get latency and throughput numbers for a specific architecture from special documents called [instruction tables](https://www.agner.org/optimize/instruction_tables.pdf). Here are some sample values for my Zen 2 (all specified for 32-bit operands, if there is any difference): | Instruction | Latency | RThroughput | |-------------|---------|:------------| @@ -30,11 +30,11 @@ You can get latency and throughput numbers for a specific architecture from spec Some comments: -- Because our minds are so used to the cost model where "more" means "worse", people mostly use *reciprocals* of throughput instead of throughput. +- Because our minds are so used to the cost model where "more" means "worse," people mostly use *reciprocals* of throughput instead of throughput. - If a certain instruction is especially frequent, its execution unit could be duplicated to increase its throughput — possibly to even more than one, but not higher than the [decode width](/hpc/architecture/layout). - Some instructions have a latency of 0. This means that these instruction are used to control the scheduler and don't reach the execution stage. They still have non-zero reciprocal throughput because the [CPU front-end](/hpc/architecture/layout) still needs to process them. -- Most instructions are pipelined, and if they have the reciprocal throughput of $n$, this usually means that their execution unit can take another instruction after $n$ cycles (and if it is below 1, this means that there are multiple execution units, all capable of taking another instruction on the next cycle). One notable exception is the [integer division](/hpc/arithmetic/division): it is either very poorly pipelined or not pipelined at all. -- Some instructions have variable latency, depending on not only the size, but also the values of the operands. For memory operations (including fused ones like `add`), latency is usually specified for the best case (an L1 cache hit). +- Most instructions are pipelined, and if they have the reciprocal throughput of $n$, this usually means that their execution unit can take another instruction after $n$ cycles (and if it is below 1, this means that there are multiple execution units, all capable of taking another instruction on the next cycle). One notable exception is [integer division](/hpc/arithmetic/division): it is either very poorly pipelined or not pipelined at all. +- Some instructions have variable latency, depending on not only the size, but also the values of the operands. For memory operations (including fused ones like `add`), the latency is usually specified for the best case (an L1 cache hit). There are many more important little details, but this mental model will suffice for now. 
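To connect these table numbers to something observable, here is a self-contained sketch of my own (not a benchmark from the book; the absolute times depend on the clock frequency, and the compiler may unroll or partially vectorize the loops, so only the rough ratio between the two measurements is meaningful):

```c++
#include <cstdio>
#include <ctime>

const int N = 1e8;

int main() {
    volatile unsigned seed = 3; // volatile read prevents constant folding

    // Dependent chain: every multiplication waits for the previous one,
    // so the loop is limited by the *latency* of the multiply (~3 cycles).
    unsigned x = seed;
    clock_t start = clock();
    for (int i = 0; i < N; i++)
        x *= 3;
    printf("dependent:   %.2fs (x = %u)\n",
           float(clock() - start) / CLOCKS_PER_SEC, x);

    // Independent chains: the CPU overlaps them in its pipelines,
    // so the loop approaches the *reciprocal throughput* instead.
    unsigned a = seed, b = seed, c = seed;
    start = clock();
    for (int i = 0; i < N; i++) {
        a *= 3;
        b *= 5;
        c *= 7;
    }
    printf("independent: %.2fs (a+b+c = %u)\n",
           float(clock() - start) / CLOCKS_PER_SEC, a + b + c);

    return 0;
}
```

The second loop does three times the work but should not take anywhere near three times as long; the next section analyzes the same effect more carefully for `add`.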
diff --git a/content/english/hpc/pipelining/throughput.md b/content/english/hpc/pipelining/throughput.md index 28a795d2..0b596404 100644 --- a/content/english/hpc/pipelining/throughput.md +++ b/content/english/hpc/pipelining/throughput.md @@ -6,7 +6,7 @@ weight: 4 Optimizing for *latency* is usually quite different from optimizing for *throughput*: - When optimizing data structure queries or small one-time or branchy algorithms, you need to [look up the latencies](../tables) of its instructions, mentally construct the execution graph of the computation, and then try to reorganize it so that the critical path is shorter. -- When optimizing hot loops and large-dataset algorithms, you need to look up the throughputs of its instructions, count how many times each one is used per iteration, determine which of them is the bottleneck, and then try to restructure the loop so that it is used less often. +- When optimizing hot loops and large-dataset algorithms, you need to look up the throughputs of their instructions, count how many times each one is used per iteration, determine which of them is the bottleneck, and then try to restructure the loop so that it is used less often. The last advice only works for *data-parallel* loops, where each iteration is fully independent of the previous one. When there is some interdependency between consecutive iterations, there may potentially be a pipeline stall caused by a [data hazard](../hazards) as the next iteration is waiting for the previous one to complete. @@ -21,7 +21,7 @@ for (int i = 0; i < n; i++) s += a[i]; ``` -Let's assume for a moment that the compiler doesn't [vectorize](/hpc/simd) this loop, [the memory bandwidth](/hpc/memory/bandwidth) isn't a concern, and that the loop is [unrolled](/hpc/architecture/loops) so that we don't pay any additional cost associated with maintaining the loop variables. In this case, the computation becomes very simple: +Let's assume for a moment that the compiler doesn't [vectorize](/hpc/simd) this loop, [the memory bandwidth](/hpc/cpu-cache/bandwidth) isn't a concern, and that the loop is [unrolled](/hpc/architecture/loops) so that we don't pay any additional cost associated with maintaining the loop variables. In this case, the computation becomes very simple: ```c++ int s = 0; @@ -41,7 +41,7 @@ But we can go higher than that. The *throughput* of `add`[^throughput] is 2 on m The solution is to use *two* accumulators and just sum up odd and and even elements separately: ```c++ -int s0 = 0, s1 = 1; +int s0 = 0, s1 = 0; s0 += a[0]; s1 += a[1]; s0 += a[2]; @@ -64,7 +64,7 @@ If an instruction has a latency of $x$ and a throughput of $y$, then you would n This technique is mostly used with [SIMD](/hpc/simd) and not in scalar code. You can [generalize](/hpc/simd/reduction) the code above and compute sums and other reductions faster than the compiler. -In general, when optimizing loops, you usually have just one or a few *execution ports* that you want to utilize to their fullest, and you engineer the rest of the loop around them. As different instructions may use different sets of ports, it is not always clear which one is going to be the overused. In situations like this, [machine code analyzers](/hpc/profiling/mca) can be very helpful for finding bottlenecks of small assembly loops. +In general, when optimizing loops, you usually have just one or a few *execution ports* that you want to utilize to their fullest, and you engineer the rest of the loop around them. 
As different instructions may use different sets of ports, it is not always clear which one is going to be overused. In situations like this, [machine code analyzers](/hpc/profiling/mca) can be very helpful for finding the bottlenecks of small assembly loops. + +Splitting up source files allows you to speed up compilation using a caching build system such as [Make](https://en.wikipedia.org/wiki/Make_(software)). + +I usually carry a version of this Makefile across my projects: + +```c++ +compile = g++ -std=c++17 -O3 -march=native -Wall + +%: %.cc gcd.hh + $(compile) $< -o $@ + +%.s: %.cc gcd.hh + $(compile) -S -fverbose-asm $< -o $@ + +%.run: % + @./$< + +.PHONY: %.run +``` + +You can now compile `example.cc` with `make example`, and automatically run it with `make example.run`. + +You can also add scripts for calculating statistics in the Makefile, or incorporate it with `perf stat` calls to make profiling automatic. + +### Jupyter Notebooks + +To speed up high-level analytics, you can create a Jupyter notebook where you put all your scripts and do all the plots. + +It is convenient to add a wrapper for benchmarking an implementation, which just returns a scalar result: + +```python +def bench(source, n=2**20): + !make -s {source} + if _exit_code != 0: + raise Exception("Compilation failed") + res = !./{source} {n} {q} + duration = float(res[0].split()[0]) + return duration +``` + +Then you can use it to write clean analytics code: + +```python +ns = list(int(1.17**k) for k in range(30, 60)) +baseline = [bench('std_lower_bound', n=n) for n in ns] +results = [bench('my_binary_search', n=n) for n in ns] + +# plotting relative speedup for different array sizes +import matplotlib.pyplot as plt + +plt.plot(ns, [x / y for x, y in zip(baseline, results)]) +plt.show() +``` + +Once established, this workflow makes you iterate much faster and focus on optimizing the algorithm itself. diff --git a/content/english/hpc/profiling/events.md b/content/english/hpc/profiling/events.md index c531ed28..eb2ba613 100644 --- a/content/english/hpc/profiling/events.md +++ b/content/english/hpc/profiling/events.md @@ -3,9 +3,15 @@ title: Statistical Profiling weight: 2 --- -Another, less invasive approach to profiling is to interrupt the execution of a program at random intervals and look where the instruction pointer is. The number of times the pointer stopped in each function's block would be roughly proportional to the total time spent executing these functions. You can also get some other useful information this way, like finding out which functions are called by which functions by inspecting the call stack. +[Instrumentation](../instrumentation) is a rather tedious way of doing profiling, especially if you are interested in multiple small sections of the program. And even if it can be partially automated by the tooling, it still won't help you gather some fine-grained statistics because of its inherent overhead. -This could in principle be done by just running a program with `gdb` and `ctrl+c`'ing it at random intervals, but modern CPUs and operating systems provide special utilities for this type of profiling. Hardware *performance counters* are special registers built into microprocessors that can store the counts of certain hardware-related activities. They are cheap to add on a microchip, as they are basically just binary counters with an activation wire connected to them. 
+Another, less invasive approach to profiling is to interrupt the execution of a program at random intervals and look where the instruction pointer is. The number of times the pointer stopped in each function's block would be roughly proportional to the total time spent executing these functions. You can also get some other useful information this way, like finding out which functions are called by which functions by inspecting [the call stack](/hpc/architecture/functions). + +This could, in principle, be done by just running a program with `gdb` and `ctrl+c`'ing it at random intervals but modern CPUs and operating systems provide special utilities for this type of profiling. + +### Hardware Events + +Hardware *performance counters* are special registers built into microprocessors that can store the counts of certain hardware-related activities. They are cheap to add on a microchip, as they are basically just binary counters with an activation wire connected to them. Each performance counter is connected to a large subset of circuitry and can be configured to be incremented on a particular hardware event, such as a branch mispredict or a cache miss. You can reset a counter at the start of a program, run it, and output its stored value at the end, and it will be equal to the exact number of times a certain event has been triggered throughout the execution. @@ -15,9 +21,9 @@ Overall, event-driven statistical profiling is usually the most effective and ea ### Profiling with perf -There are many profilers and other performance analysis tools. The one we will mostly rely on in this book is [perf](https://perf.wiki.kernel.org/), which is a statistical profiler available in the Linux kernel. On non-Linux systems, you can use [VTune](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/vtune-profiler.html#gs.cuc0ks) from Intel, which provides roughly the same functionality for our purposes. It is available for free, although it is a proprietary software for which you need to refresh a community license every 90 days, while perf is free as in freedom. +Performance analysis tools that rely on the event sampling techniques described above are called *statistical profilers*. There are many of them, but the one we will mainly use in this book is [perf](https://perf.wiki.kernel.org/), which is a statistical profiler shipped with the Linux kernel. On non-Linux systems, you can use [VTune](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/vtune-profiler.html#gs.cuc0ks) from Intel, which provides roughly the same functionality for our purposes. It is available for free, although it is proprietary, and you need to refresh your community license every 90 days, while perf is free as in freedom. -Perf is a command-line application that generates reports based on live execution of programs. It does not need the source and can profile a very wide range of applications, even those that involve multiple processes and interaction with the operating system. +Perf is a command-line application that generates reports based on the live execution of programs. It does not need the source and can profile a very wide range of applications, even those that involve multiple processes and interaction with the operating system. 
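As a small aside before the walkthrough (a sketch of my own, not part of the original example): one counter you can read directly from user space without any tooling is the time-stamp counter. It counts reference cycles at a constant rate and does not serialize the pipeline, so it is far cruder than the events perf works with, but it is occasionally handy for quick checks:

```c++
#include <x86intrin.h>
#include <cstdio>

int main() {
    const int N = 1000000;
    unsigned long long start = __rdtsc();

    volatile int s = 0;
    for (int i = 0; i < N; i++)
        s += i;

    unsigned long long end = __rdtsc();
    printf("%.2f reference cycles per iteration\n", double(end - start) / N);
    return 0;
}
```

Everything below uses perf itself, which reads the real performance-monitoring counters and attributes them to specific instructions.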
For explanation purposes, I have written a small program that creates an array of a million random integers, sorts it, and then does a million binary searches on it: @@ -38,7 +44,7 @@ int query() { } ``` -After compiling it (`g++ -O3 -march=native example.cc -o run`), we can run it with `perf stat ./run`, which outputs the counts of basic performance events during the execution: +After compiling it (`g++ -O3 -march=native example.cc -o run`), we can run it with `perf stat ./run`, which outputs the counts of basic performance events during its execution: ```yaml Performance counter stats for './run': @@ -60,7 +66,7 @@ After compiling it (`g++ -O3 -march=native example.cc -o run`), we can run it wi 0.000000000 seconds sys ``` -You can see that the execution took 0.53 seconds, or 852M cycles at effective 1.32 GHz clock rate, over which 479M instructions were executed. There were also a total of 122.7M branches, and 15.7% of them were mispredicted. +You can see that the execution took 0.53 seconds or 852M cycles at an effective 1.32 GHz clock rate, over which 479M instructions were executed. There were also 122.7M branches, and 15.7% of them were mispredicted. You can get a list of all supported events with `perf list`, and then specify a list of specific events you want with the `-e` option. For example, for diagnosing binary search, we mostly care about cache misses: @@ -73,7 +79,7 @@ You can get a list of all supported events with `perf list`, and then specify a By itself, `perf stat` simply sets up performance counters for the whole program. It can tell you the total number of branch mispredictions, but it won't tell you *where* they are happening, let alone *why* they are happening. -To try the stop-the-world approach we talked about initially, we need to use `perf record `, which records profiling data and dumps it as a `perf.data` file, and then call `perf report` to inspect it. I highly advise you to go and try it yourselves because the last command is interactive and colorful, but for those that can't do it right now, I'll try to describe it the best I can. +To try the stop-the-world approach we discussed previously, we need to use `perf record `, which records profiling data and dumps it as a `perf.data` file, and then call `perf report` to inspect it. I highly advise you to go and try it yourselves because the last command is interactive and colorful, but for those that can't do it right now, I'll try to describe it the best I can. When you call `perf report`, it first displays a `top`-like interactive report that tells you which functions are taking how much time: @@ -87,7 +93,7 @@ Overhead Command Shared Object Symbol 0.80% run libc-2.33.so [.] rand ``` -Note that, for each function, just its *overhead* is listed and not the total running time (e. g. `setup` includes `std::__introsort_loop` but only its own overhead is accounted as 3.43%). You also need to account for possible inlining, which is apparently what happened with `std::lower_bound` here. Perf also tracks shared libraries (like `libc`) and, in general, any other spawned processes: if you want, you can launch a web browser with perf and see what's happening inside. +Note that, for each function, just its *overhead* is listed and not the total running time (e.g., `setup` includes `std::__introsort_loop` but only its own overhead is accounted as 3.43%). There are tools for constructing [flame graphs](https://www.brendangregg.com/flamegraphs.html) out of perf reports to make them more clear. 
You also need to account for possible inlining, which is apparently what happened with `std::lower_bound` here. Perf also tracks shared libraries (like `libc`) and, in general, any other spawned processes: if you want, you can launch a web browser with perf and see what's happening inside. Next, you can "zoom in" on any of these functions, and, among others things, it will offer to show you its disassembly with an associated heatmap. For example, here is the assembly for `query`: @@ -116,8 +122,8 @@ Next, you can "zoom in" on any of these functions, and, among others things, it │ ↑ jne 20 ``` -On the left column, you can see the fraction of times the instruction pointer stopped on a specific line. Because of intricacies such as pipelining and out-of-order execution, "now" is not a well-defined concept in modern CPUs, so the data is slightly inaccurate as the instruction pointer drifts a little bit forward. But it is still useful: here we spend ~65% of the time on the jump instruction because it has a comparison operator before it, indicating that the control flow waits there for this comparison to be decided. +On the left column is the fraction of times that the instruction pointer stopped on a specific line. You can see that we spend ~65% of the time on the jump instruction because it has a comparison operator before it, indicating that the control flow waits there for this comparison to be decided. -At the individual cycle level, we need something more precise. +Because of intricacies such as [pipelining](/hpc/pipelining) and out-of-order execution, "now" is not a well-defined concept in modern CPUs, so the data is slightly inaccurate as the instruction pointer drifts a little bit forward. The instruction-level data is still useful, but at the individual cycle level, we need to switch to [something more precise](../simulation). diff --git a/content/english/hpc/profiling/instrumentation.md b/content/english/hpc/profiling/instrumentation.md index 8ca71f99..a622e24a 100644 --- a/content/english/hpc/profiling/instrumentation.md +++ b/content/english/hpc/profiling/instrumentation.md @@ -1,8 +1,11 @@ --- title: Instrumentation weight: 1 +published: true --- + + *Instrumentation* is an overcomplicated term that means inserting timers and other tracking code into programs. The simplest example is using the `time` utility in Unix-like systems to measure the duration of execution for the whole program. More generally, we want to know *which parts* of the program need optimization. There are tools shipped with compilers and IDEs that can time designated functions automatically, but it is more robust to do it by hand using any methods of interacting with time that the language provides: @@ -14,9 +17,9 @@ float seconds = float(clock() - start) / CLOCKS_PER_SEC; printf("do_something() took %.4f", seconds); ``` -One nuance here is that you can't measure the execution time of particularly quick functions this way. The `clock` function returns the current timestamp in microseconds ($10^{-6}$), and it does so by waiting to the nearest ceiled microsecond — so it basically takes up to 1000ns to complete, which is an eternity in the world of low-level optimization. +One nuance here is that you can't measure the execution time of particularly quick functions this way because the `clock` function returns the current timestamp in microseconds ($10^{-6}$) and also by itself takes up to a few hundred nanoseconds to complete. 
All other time-related utilities similarly have at least microsecond granularity, which is an eternity in the world of low-level optimization. -As a workaround, you can invoke the function repeatedly in a loop, time the whole thing once, and then divide the total time by the number of iterations. You also need to ensure nothing gets cached or affected by similar side effects. This is a rather tedious way of doing profiling, especially if you are interested in multiple small sections of the program. +To achieve higher precision, you can invoke the function repeatedly in a loop, time the whole thing once, and then divide the total time by the number of iterations: ```cpp #include @@ -28,28 +31,20 @@ int main() { clock_t start = clock(); for (int i = 0; i < N; i++) - clock(); + clock(); // benchmarking the clock function itself float duration = float(clock() - start) / CLOCKS_PER_SEC; - printf("%.2fns\n", 1e9 * duration / N); + printf("%.2fns per iteration\n", 1e9 * duration / N); return 0; } ``` - +You also need to ensure that nothing gets cached, optimized away by the compiler, or affected by similar side effects. This is a separate and highly complicated topic that we will discuss in more detail at [the end of the chapter](../benchmarking). ### Event Sampling -Instrumentation can also be used for collecting other types of info that can give useful insights about the performance of a particular algorithm. For example: +Instrumentation can also be used to collect other types of information that can give useful insights about the performance of a particular algorithm. For example: - for a hash function, we are interested in the average length of its input; - for a binary tree, we care about its size and height; @@ -85,4 +80,4 @@ void query() { This way we can remove the need to sample a new random number on each invocation, only resetting the counter when we choose to calculate statistics. -Techniques like that are frequently by library algorithm developers inside large projects to collect profiling data without affecting the performance of the end program too much. +Techniques like that are frequently used by library algorithm developers inside large projects to collect profiling data without affecting the performance of the end program too much. diff --git a/content/english/hpc/profiling/mca.md b/content/english/hpc/profiling/mca.md index 8f89fe54..99cfe2ed 100644 --- a/content/english/hpc/profiling/mca.md +++ b/content/english/hpc/profiling/mca.md @@ -3,13 +3,13 @@ title: Machine Code Analyzers weight: 4 --- -The second category is *machine code analyzers*. These are programs that take assembly code and simulate its execution on a particular microarchitecture using information available to compilers, and output the latency and throughput of the whole snippet, as well as cycle-perfect utilization of various resources in a CPU. +A *machine code analyzer* is a program that takes a small snippet of assembly code and [simulates](../simulation) its execution on a particular microarchitecture using information available to compilers, and outputs the latency and throughput of the whole block, as well as cycle-perfect utilization of various resources within the CPU. -There are many of them, but I personally prefer `llvm-mca`, which you can probably install via a package manager together with `clang`. You can also access it through a new web-based tool called [UICA](https://uica.uops.info). 
+### Using `llvm-mca` -### Machine Code Analyzers +There are many different machine code analyzers, but I personally prefer `llvm-mca`, which you can probably install via a package manager together with `clang`. You can also access it through a web-based tool called [UICA](https://uica.uops.info) or in the [Compiler Explorer](https://godbolt.org/) by selecting "Analysis" as the language. -What machine code analyzers do is they run a set number of iterations of a given assembly snippet and compute statistics about the resource usage of each instruction, which is useful for finding out where the bottleneck is. +What `llvm-mca` does is it runs a set number of iterations of a given assembly snippet and computes statistics about the resource usage of each instruction, which is useful for finding out where the bottleneck is. We will consider the array sum as our simple example: @@ -21,7 +21,7 @@ loop: jne loop ```` -Here is its analysis with `llvm-mca` on Skylake. You are not going to understand much, but that's fine for now. +Here is its analysis with `llvm-mca` for the Skylake microarchitecture: ```yaml Iterations: 100 @@ -37,10 +37,10 @@ Block RThroughput: 0.8 First, it outputs general information about the loop and the hardware: -- It "ran" the loop 100 times, executing 400 instructions in total in 108 cycles, which is the same as executing $\frac{400}{108} \approx 3.7$ instructions per cycle ("IPC") on average. -- The CPU is theoretically capable of executing up to 6 instructions per cycle ("dispatch width"). -- Each cycle in theory can be executed in 0.8 cycles on average ("block reciprocal throughput"). -- The "uOps" here are the micro-operations that CPU splits each instruction into (e. g. fused load-add is composed of two uOps). +- It "ran" the loop 100 times, executing 400 instructions in total in 108 cycles, which is the same as executing $\frac{400}{108} \approx 3.7$ [instructions per cycle](/hpc/complexity/hardware) on average (IPC). +- The CPU is theoretically capable of executing up to 6 instructions per cycle ([dispatch width](/hpc/architecture/layout)). +- Each cycle in theory can be executed in 0.8 cycles on average ([block reciprocal throughput](/hpc/pipelining/tables)). +- The "uOps" here are the micro-operations that the CPU splits each instruction into (e.g., fused load-add is composed of two uOps). Then it proceeds to give information about each individual instruction: @@ -60,11 +60,11 @@ Instruction Info: 1 1 0.50 jne -11 ``` -There is nothing there that there isn't in the instruction tables: +There is nothing there that there isn't in the [instruction tables](/hpc/pipelining/tables): - how many uOps each instruction is split into; -- how many cycles each instruction takes to complete ("latency"); -- how many cycles each instruction takes to complete in the amortized sense ("reciprocal throughput"), considering that several copies of it can be executed simultaneously. +- how many cycles each instruction takes to complete (latency); +- how many cycles each instruction takes to complete in the amortized sense (reciprocal throughput), considering that several copies of it can be executed simultaneously. 
Then it outputs probably the most important part — which instructions are executing when and where:
@@ -77,6 +77,12 @@ Resource pressure by instruction:
 - - 0.99 - - - - - 0.01 - jne -11
 ```
+As the contention for execution ports causes [structural hazards](/hpc/pipelining/hazards), ports often become the bottleneck for throughput-oriented loops, and this chart helps diagnose why. It does not give you a cycle-perfect Gantt chart or anything like that, but it gives you the aggregate statistics of the execution ports used for each instruction, which lets you find which one is overloaded.
+
+
diff --git a/content/english/hpc/profiling/noise.md b/content/english/hpc/profiling/noise.md
new file mode 100644
index 00000000..b1b186ae
--- /dev/null
+++ b/content/english/hpc/profiling/noise.md
@@ -0,0 +1,147 @@
+---
+title: Getting Accurate Results
+weight: 10
+published: true
+---
+
+It is not uncommon for there to be two library algorithm implementations, each maintaining its own benchmarking code, and each claiming to be faster than the other. This confuses everyone involved, especially the users, who have to somehow choose between the two.
+
+Situations like these are usually not caused by fraudulent actions by their authors; they just have different definitions of what "faster" means, and indeed, defining and using just one performance metric is often very problematic.
+
+### Measuring the Right Thing
+
+There are many things that can introduce bias into benchmarks.
+
+**Differing datasets.** There are many algorithms whose performance somehow depends on the dataset distribution. In order to define, for example, what the fastest sorting, shortest path, or binary search algorithms are, you have to fix the dataset on which the algorithm is run.
+
+This sometimes applies even to algorithms that process a single piece of input. For example, it is not a good idea to feed GCD implementations sequential numbers because it makes branches very predictable:
+
+```c++
+// don't do this
+int checksum = 0;
+
+for (int a = 0; a < 1000; a++)
+    for (int b = 0; b < 1000; b++)
+        checksum ^= gcd(a, b);
+```
+
+However, if we sample these same numbers randomly, branch prediction becomes much harder, and the benchmark takes longer, despite processing the same input, only in a different order:
+
+```c++
+int a[1000], b[1000];
+
+for (int i = 0; i < 1000; i++)
+    a[i] = rand() % 1000, b[i] = rand() % 1000;
+
+int checksum = 0;
+
+for (int t = 0; t < 1000; t++)
+    for (int i = 0; i < 1000; i++)
+        checksum += gcd(a[i], b[i]);
+```
+
+
+Although the most logical choice in most cases is to just sample data uniformly at random, many real-world applications have distributions that are far from uniform, so you can't pick just one. In general, a good benchmark should be application-specific, and use the dataset that is as representative of your real use case as possible.
+
+
+
+**Multiple objectives.** Some algorithm design problems have more than one key objective. For example, hash tables, in addition to being highly dependent on the distribution of keys, also need to carefully balance:
+
+- memory usage,
+- latency of add query,
+- latency of positive membership query,
+- latency of negative membership query.
+
+The only way to choose between hash table implementations is to try and put multiple variants into the application.
+
+**Latency vs Throughput.** Another aspect that people often overlook is that the execution time can be defined in more than one way, even for a single query.
+
+When you write code like this:
+
+```c++
+for (int i = 0; i < N; i++)
+    q[i] = rand();
+
+int checksum = 0;
+
+for (int i = 0; i < N; i++)
+    checksum ^= lower_bound(q[i]);
+```
+
+and then time the whole thing and divide it by the number of iterations, you are actually measuring the *throughput* of the query — how many operations it can process per unit of time. This is usually less than the time it actually takes to process one operation separately because of interleaving.
+
+To measure actual *latency*, you need to introduce a dependency between the invocations:
+
+```c++
+for (int i = 0; i < N; i++)
+    checksum ^= lower_bound(checksum ^ q[i]);
+```
+
+This distinction usually makes the most difference in algorithms with possible pipeline stall issues, e.g., when comparing branchy and branch-free algorithms.
+
+**Cold cache.** Another source of bias is the *cold cache effect*, when memory reads initially take longer because the required data is not yet in the cache.
+
+This is solved by making a *warm-up run* before starting measurements:
+
+```c++
+// warm-up run
+
+volatile int checksum = 0;
+
+for (int i = 0; i < N; i++)
+    checksum ^= lower_bound(q[i]);
+
+
+// actual run
+
+clock_t start = clock();
+checksum = 0;
+
+for (int i = 0; i < N; i++)
+    checksum ^= lower_bound(q[i]);
+```
+
+It is also sometimes convenient to combine the warm-up run with answer validation, if it is more complicated than just computing some sort of checksum.
+
+**Over-optimization.** Sometimes the benchmark is outright erroneous because the compiler just optimized the benchmarked code away. To prevent the compiler from cutting corners, you need to add checksums and either print them somewhere or add the `volatile` qualifier, which also prevents any sort of interleaving of loop iterations.
+
+For algorithms that only write data, you can use the `__sync_synchronize()` intrinsic to add a memory fence and prevent the compiler from accumulating updates.
+
+### Reducing Noise
+
+
+
+The issues we've described produce *bias* in measurements: they consistently give advantage to one algorithm over the other. There are other types of possible problems with benchmarking that result in either unpredictable skews or just completely random noise, thus increasing *variance*.
+
+These types of issues are caused by side effects and some sort of external noise, mostly due to noisy neighbors and CPU frequency scaling:
+
+- If you benchmark a compute-bound algorithm, measure its performance in cycles using `perf stat`: this way it will be independent of clock frequency, fluctuations of which are usually the main source of noise.
+- Otherwise, set the core frequency to what you expect it to be and make sure nothing interferes with it. On Linux you can do it with `cpupower` (e.g., `sudo cpupower frequency-set -g powersave` to put it to minimum or `sudo cpupower frequency-set -g ondemand` to enable turbo boost). I use a [convenient GNOME shell extension](https://extensions.gnome.org/extension/1082/cpufreq/) that has a separate button to do it.
+- If applicable, turn hyper-threading off and attach jobs to specific cores. Make sure no other jobs are running on the system, turn off networking, and try not to fiddle with the mouse.
+
+You can't remove noise and biases completely.
Even a program's name can affect its speed: the executable's name ends up in an environment variable, environment variables end up on the call stack, and so the length of the name affects stack alignment, which can result in data accesses slowing down due to crossing cache line or memory page boundaries. + +It is important to account for the noise when guiding optimizations and especially when reporting results to someone else. Unless you are expecting a 2x kind of improvement, treat all microbenchmarks the same way as A/B testing. + +When you run a program on a laptop for under a second, a ±5% fluctuation in performance is completely normal. So, if you want to decide whether to revert or keep a potential +1% improvement, run it until you reach statistical significance, which you can determine by calculating variances and p-values. + +### Further Reading + +Interested readers can explore this comprehensive [list of experimental computer science resources](https://www.cs.huji.ac.il/w~feit/exp/related.html) by Dror Feitelson, perhaps starting with "[Producing Wrong Data Without Doing Anything Obviously Wrong](http://eecs.northwestern.edu/~robby/courses/322-2013-spring/mytkowicz-wrong-data.pdf)" by Todd Mytkowicz et al. + +You can also watch [this great talk](https://www.youtube.com/watch?v=r-TLSBdHe1A) by Emery Berger on how to do statistically sound performance evaluation. diff --git a/content/english/hpc/profiling/simulation.md b/content/english/hpc/profiling/simulation.md index a8026761..75401b8a 100644 --- a/content/english/hpc/profiling/simulation.md +++ b/content/english/hpc/profiling/simulation.md @@ -1,9 +1,114 @@ --- -title: Simulation +title: Program Simulation weight: 3 -draft: true --- +The last approach to profiling (or rather a group of them) is not to gather the data by actually running the program but to analyze what should happen by *simulating* it with specialized tools. + + + +There are many subcategories of such profilers, differing in which aspect of computation is simulated. In this article, we are going to focus on [caching](/hpc/cpu-cache) and [branch prediction](/hpc/pipelining/branching), and use [Cachegrind](https://valgrind.org/docs/manual/cg-manual.html) for that, which is a profiling-oriented part of [Valgrind](https://valgrind.org/), a well-established tool for memory leak detection and memory debugging in general. + +### Profiling with Cachegrind + +Cachegrind essentially inspects the binary for "interesting" instructions — that perform memory reads / writes and conditional / indirect jumps — and replaces them with code that simulates corresponding hardware operations using software data structures. 
It therefore doesn't need access to the source code and can work with already compiled programs, and can be run on any program like this: + +```bash +valgrind --tool=cachegrind --branch-sim=yes ./run +# also simulate branch prediction ^ ^ any command, not necessarily one process +``` + +It instruments all involved binaries, runs them, and outputs a summary similar to [perf stat](../events): + +``` +I refs: 483,664,426 +I1 misses: 1,858 +LLi misses: 1,788 +I1 miss rate: 0.00% +LLi miss rate: 0.00% + +D refs: 115,204,359 (88,016,970 rd + 27,187,389 wr) +D1 misses: 9,722,664 ( 9,656,463 rd + 66,201 wr) +LLd misses: 72,587 ( 8,496 rd + 64,091 wr) +D1 miss rate: 8.4% ( 11.0% + 0.2% ) +LLd miss rate: 0.1% ( 0.0% + 0.2% ) + +LL refs: 9,724,522 ( 9,658,321 rd + 66,201 wr) +LL misses: 74,375 ( 10,284 rd + 64,091 wr) +LL miss rate: 0.0% ( 0.0% + 0.2% ) + +Branches: 90,575,071 (88,569,738 cond + 2,005,333 ind) +Mispredicts: 19,922,564 (19,921,919 cond + 645 ind) +Mispred rate: 22.0% ( 22.5% + 0.0% ) +``` + +We've fed Cachegrind exactly the same example code as in [the previous section](../events): we create an array of a million random integers, sort it, and then perform a million binary searches on it. Cachegrind shows roughly the same numbers as perf does, except that that perf's measured numbers of memory reads and branches are slightly inflated due to [speculative execution](/hpc/pipelining): they really happen in hardware and thus increment hardware counters, but are discarded and don't affect actual performance, and thus ignored in the simulation. + +Cachegrind only models the first (`D1` for data, `I1` for instructions) and the last (`LL`, unified) levels of cache, the characteristics of which are inferred from the system. It doesn't limit you in any way as you can also set them from the command line, e g., to model the L2 cache: `--LL=,,`. + +It seems like it only slowed down our program so far and hasn't provided us any information that `perf stat` couldn't. To get more out of it than just the summary info, we can inspect a special file with profiling info, which it dumps by default in the same directory named as `cachegrind.out.`. It is human-readable, but is expected to be read via the `cg_annotate` command: + +```bash +cg_annotate cachegrind.out.4159404 --show=Dr,D1mr,DLmr,Bc,Bcm +# ^ we are only interested in data reads and branches +``` + +First it shows the parameters that were used during the run, including the characteristics of the cache system: + +``` +I1 cache: 32768 B, 64 B, 8-way associative +D1 cache: 32768 B, 64 B, 8-way associative +LL cache: 8388608 B, 64 B, direct-mapped +``` + +It didn't get the L3 cache quite right: it is not unified (8M in total, but a single core only sees 4M) and also 16-way associative, but we will ignore that for now. + +Next, it outputs a per-function summary similar to `perf report`: + +``` +Dr D1mr DLmr Bc Bcm file:function +-------------------------------------------------------------------------------- +19,951,476 8,985,458 3 41,902,938 11,005,530 ???:query() +24,832,125 585,982 65 24,712,356 7,689,480 ???:void std::__introsort_loop<...> +16,000,000 60 3 9,935,484 129,044 ???:random_r +18,000,000 2 1 6,000,000 1 ???:random + 4,690,248 61,999 17 5,690,241 1,081,230 ???:setup() + 2,000,000 0 0 0 0 ???:rand +``` + +You can see there are a lot of branch mispredicts in the sorting stage, and also a lot of both L1 cache misses and branch mispredicts during binary searching. 
We couldn't get this information with perf — it would only tell use these counts for the whole program. + +Another great feature that Cachegrind has is the line-by-line annotation of source code. For that, you need to compile the program with debug information (`-g`) and either explicitly tell `cg_annotate` which source files to annotate or just pass the `--auto=yes` option so that it annotates everything it can reach (including the standard library source code). + +The whole source-to-analysis process would therefore go like this: + +```bash +g++ -O3 -g sort-and-search.cc -o run +valgrind --tool=cachegrind --branch-sim=yes --cachegrind-out-file=cachegrind.out ./run +cg_annotate cachegrind.out --auto=yes --show=Dr,D1mr,DLmr,Bc,Bcm +``` + +Since the glibc implementations are not the most readable, for exposition purposes, we replace `lower_bound` with our own binary search, which will be annotated like this: + +```c++ +Dr D1mr DLmr Bc Bcm + . . . . . int binary_search(int x) { + 0 0 0 0 0 int l = 0, r = n - 1; + 0 0 0 20,951,468 1,031,609 while (l < r) { + 0 0 0 0 0 int m = (l + r) / 2; +19,951,468 8,991,917 63 19,951,468 9,973,904 if (a[m] >= x) + . . . . . r = m; + . . . . . else + 0 0 0 0 0 l = m + 1; + . . . . . } + . . . . . return l; + . . . . . } +``` + +Unfortunately, Cachegrind only tracks memory accesses and branches. When the bottleneck is caused by something else, we need [other simulation tools](../mca). diff --git a/content/english/hpc/simd/_index.md b/content/english/hpc/simd/_index.md index 883fb0ab..50f6e3ed 100644 --- a/content/english/hpc/simd/_index.md +++ b/content/english/hpc/simd/_index.md @@ -27,22 +27,22 @@ Now, let's add the following magic directive in the very beginning: // ...the rest is the same as before ``` -Compiled and run in the exact same environment, it now finishes in 1.24 seconds. This is almost twice as fast, and we didn't change a single line of code or the optimization level. +When compiled and run in the same environment, it finishes in 1.24 seconds. This is almost twice as fast, and we didn't change a single line of code or the optimization level. -What happened here is we provided a little bit of info about the computer on which this code is supposed to be run. Specifically, we told the compiler that the target CPU supports an extension to x86 instruction set called "AVX2". AVX2 is one of the many so-called "SIMD extensions" for x86. These extensions include instructions that operate on special registers capable of holding 128, 256, or even 512 bits of data using the "single instruction, multiple data" (SIMD) approach. Instead of working with a single scalar value, SIMD instructions divide the data in registers into blocks of 8, 16, 32, or 64 bits and perform the same operation on them in parallel, yielding a proportional increase in performance[^power]. +What happened here is we provided a little bit of info about the computer on which this code is supposed to be run. Specifically, we told the compiler that the target CPU supports an extension to the x86 instruction set called "AVX2." AVX2 is one of the many so-called "SIMD extensions" for x86. These extensions include instructions that operate on special registers capable of holding 128, 256, or even 512 bits of data using the "single instruction, multiple data" (SIMD) approach. 
Instead of working with a single scalar value, SIMD instructions divide the data in registers into blocks of 8, 16, 32, or 64 bits and perform the same operation on them in parallel, yielding a proportional increase in performance[^power]. -[^power]: On some CPUs, especially heavy SIMD instructions consume more energy and thus [require downclocking](https://blog.cloudflare.com/on-the-dangers-of-intels-frequency-scaling/) in order to balance off the total power consumption, so the real time speedup is not always proportional. +[^power]: On some CPUs, especially heavy SIMD instructions consume more energy and thus [require downclocking](https://blog.cloudflare.com/on-the-dangers-of-intels-frequency-scaling/) to balance off the total power consumption, so the real-time speedup is not always proportional. ![](img/simd.png) -These extensions are relatively new, and their support in CPUs has been implemented gradually while maintaining backwards compatibility[^avx512]. Apart from adding more specialized instructions, the most important difference between them is the introduction of progressively wider registers. +These extensions are relatively new, and their support in CPUs has been implemented gradually while maintaining backward compatibility[^avx512]. Apart from adding more specialized instructions, the most important difference between them is the introduction of progressively wider registers. -In particular, AVX2 has instructions for working with 256-bit registers, while by default GCC assumes that nothing past the 128-bit SSE2 is enabled. Hence, after telling the optimizer that it can use instructions that add 8 integers at once instead of 4, the performance was increased twofold. +In particular, AVX2 has instructions for working with 256-bit registers, while by default, GCC assumes that nothing past the 128-bit SSE2 is enabled. Hence, after telling the optimizer that it can use instructions that add 8 integers at once instead of 4, the performance was increased twofold. -[^avx512]: Starting with AVX512, backwards compatibility is no longer maintained: there are many different "flavours" tailored to specific needs such as data compression, encryption or machine learning. +[^avx512]: Starting with AVX512, backward compatibility is no longer maintained: there are many different "flavors" tailored to specific needs such as data compression, encryption, or machine learning. ![](img/intel-extensions.webp) -Compilers often do a good job rewriting simple loops with SIMD instructions, like in the case above. This optimization is called *autovectorization*, and it is the preferred way to use SIMD. +Compilers often do a good job rewriting simple loops with SIMD instructions, like in the case above. This optimization is called [auto-vectorization](auto-vectorization), and it is the most popular way of using SIMD. -The problem is, it only works with certain types of loops, and even then it often yields suboptimal results. To understand its limitations, we need to get our hands dirty and explore this technology on a lower level, which is what we will do in this chapter. +The problem is that it only works with certain types of loops, and even then it often yields suboptimal results. To understand its limitations, we need to get our hands dirty and explore this technology on a lower level, which is what we are going to do in this chapter. 
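+
+For reference, here is a minimal sketch of the kind of loop this speedup refers to (the array size, iteration count, and `main` wrapper are illustrative rather than the exact benchmark code):
+
+```c++
+#pragma GCC target("avx2")   // lets the compiler use 256-bit SIMD instructions
+#pragma GCC optimize("O3")
+
+const int n = 1e5;
+int a[n], b[n], c[n];
+
+int main() {
+    for (int t = 0; t < 100000; t++)
+        for (int i = 0; i < n; i++)
+            c[i] = a[i] + b[i];   // this loop gets auto-vectorized
+    return 0;
+}
+```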
diff --git a/content/english/hpc/simd/_pres.md b/content/english/hpc/simd/_pres.md deleted file mode 100644 index 3e3b11fe..00000000 --- a/content/english/hpc/simd/_pres.md +++ /dev/null @@ -1,401 +0,0 @@ ---- -title: SIMD Instructions -draft: true ---- - - -## Recall: Superscalar Processors - -* Any instruction execution takes multiple steps -* To hide latency, everything is pipelined -* You can get CPI < 1 if you have more than one of each execution unit -* Performance engineering is basically about avoiding pipeline stalls - -![](https://upload.wikimedia.org/wikipedia/commons/thumb/4/46/Superscalarpipeline.svg/2880px-Superscalarpipeline.svg.png =450x) - ---- - -## Single Instruction, Multple Data - -![](https://upload.wikimedia.org/wikipedia/commons/thumb/2/21/SIMD.svg/1200px-SIMD.svg.png =450x) - -Instructions that perform the same operation on multiple data points -(blocks of 128, 256 or 512 bits, also called *vectors*) - ----- - -![](https://i0.wp.com/www.urtech.ca/wp-content/uploads/2017/11/Intel-mmx-sse-sse2-avx-AVX-512.png =500x) - -Backwards-compatible up until AVX-512 - -(x86 specific; ARM and others have similar instruction sets) - ----- - -You can check compatibility during runtime: - -```cpp -cout << __builtin_cpu_supports("sse") << endl; -cout << __builtin_cpu_supports("sse2") << endl; -cout << __builtin_cpu_supports("avx") << endl; -cout << __builtin_cpu_supports("avx2") << endl; -cout << __builtin_cpu_supports("avx512f") << endl; -``` - -...or call `cat /proc/cpuinfo` and see CPU flags along with other info - ---- - -## How to Use SIMD - -Converting a program from scalar to vector one is called *vectorization*, -which can be achieved using a combination of: - -* x86 assembly -* **C/C++ intrinsics** -* Vector types -* SIMD libraries -* **Auto-vectorization** - -Later are simpler, former are more flexible - ----- - -### Intel Intrinsics Guide - -![](https://i.imgur.com/ZIzDidV.png =600x) - -Because nobody likes to write assembly - -https://software.intel.com/sites/landingpage/IntrinsicsGuide/ - ----- - -All C++ intrinsics can be included with `x86intrin.h` - -```cpp -#pragma GCC target("avx2") -#pragma GCC optimize("O3") - -#include -#include - -using namespace std; -``` - -You can also drop pragmas and compile with `-O3 -march=native` instead - ---- - -## The A+B Problem - -```cpp -const int n = 1e5; -int a[n], b[n], c[n]; - -for (int t = 0; t < 100000; t++) - for (int i = 0; i < n; i++) - c[i] = a[i] + b[i]; -``` - -Twice as fast (!) if you compile with AVX instruction set -(i. e. 
add `#pragma GCC target("avx2")` or `-march=native`) - ----- - -## What Actually Happens - -```cpp -double a[100], b[100], c[100]; - -for (int i = 0; i < 100; i += 4) { - // load two 256-bit arrays into their respective registers - __m256d x = _mm256_loadu_pd(&a[i]); - __m256d y = _mm256_loadu_pd(&b[i]); - // - 256 is the block size - // - d stands for "double" - // - pd stands for "packed double" - - // perform addition - __m256d z = _mm256_add_pd(x, y); - // write the result back into memory - _mm256_storeu_pd(&c[i], z); -} - -``` - -(I didn't come up with the op naming, don't blame me) - ----- - -### More examples - -* `_mm_add_epi16`: adds two 16-bit extended packed integers (128/16=8 short ints) -* `_mm256_acos_pd`: computes acos of 256/64=4 doubles -* `_mm256_broadcast_sd`: creates 4 copies of a number in a "normal" register -* `_mm256_ceil_pd`: rounds double up to nearest int -* `_mm256_cmpeq_epi32`: compares 8+8 packed ints and returns a (vector) mask that contains ones for elements that are equal -* `_mm256_blendv_ps`: blends elements from either one vector or another according to a mask (vectorized cmov, could be used to replace `if`) - ----- - -### Vector Types - -For some reason, C++ intrinsics have explicit typing, for example on AVX: -* `__m256` means float and only instructions ending with "ps" work -* `__m256d` means double and only instructions ending with "pd" work -* `__m256i` means different integers and only instructions ending with "epi/epu" wor - -You can freely convert between them with C-style casting - ----- - -Also, compiles have their own vector types: - -```cpp -typedef float float8_t __attribute__ (( vector_size (8 * sizeof(float)) )); -float8_t v; -float first_element = v[0]; // you can index them as arrays -float8_t v_squared = v * v; // you can use a subset of normal C operations -float8_t v_doubled = _mm256_movemask_ps(v); // all C++ instrinsics work too -``` - -Note that this is a GCC feature; it will probably be standartized in C++ someday - -https://gcc.gnu.org/onlinedocs/gcc-4.7.2/gcc/Vector-Extensions.html - ---- - -## Data Alignment - -The main disadvantage of SIMD is that you need to get data in vectors first - -(and sometimes preprocessing is not worth the trouble) - - ----- - -![](https://i.imgur.com/TBRhLew.png =600x) - ----- - -![](https://i.imgur.com/WNH9eCc.png =600x) - ----- - -![](https://i.imgur.com/SsDwG6D.png =600x) - ----- - -For arrays, you have two options: - -1. Pad them with neutal elements (e. g. zeros) -2. 
Break loop on last block and proceed normally - -Humans prefer #1, compilers prefer #2 - - ---- - -## Reductions - -* Calculating A+B is easy, because there are no data dependencies -* Calculating array sum is different: you need an accumulator from previous step -* But we can calculate $B$ partial sums $\{i+kB\}$ for each $i - -![](https://lh3.googleusercontent.com/proxy/ovyDHaTtBkntJLFOok2m17fYS0ROX0BBy-x4jG1CsYKInNRZvDMQyG-j-DOpRHR6jhYVvX2mWBLZHi2SoDwWLJ4LhofzScPtkFxko6tlYWcFyBttn7gIy0BiWWlvkIcl6BZbRBjCR5_wdniz6sIKTr1rpN7M_whxvd0IrUGpXGwI7PwKxwLslF_h9Zv8gbstlV--dyc) - - -This trick works with any other commutative operator - - ----- - -Explicitly using C++ intrinsics: - -```cpp -int sum(int a[], int n) { - int res = 0; - - // we will store 8 partial sums here - __m256i x = _mm256_setzero_si256(); - for (int i = 0; i + 8 < n; i += 8) { - __m256i y = _mm256_loadu_si256((__m256i*) &a[i]); - // add all 8 new numbers at once to their partial sums - x = _mm256_add_epi32(x, y); - } - - // sum 8 elements in our vector ("horizontal sum") - int *b = (int*) &x; - for (int i = 0; i < 8; i++) - res += b[i]; - - // add what's left of the array in case n % 8 != 0 - for (int i = (n / 8) * 8; i < n; i++) - res += a[i]; - - return res; -} -``` - -(Don't implement it yourself, compilers are smart enough to vectorize) - ----- - -![](https://www.codeproject.com/KB/cpp/874396/Fig1.jpg) - -Horizontal addition could be implemented a bit faster - ---- - -## Memory Alignment - -There are two ways to read / write a SIMD block from memory: - -* `load` / `store` that segfault when the block doesn't fit a single cache line -* `loadu` / `storeu` that always work but are slower ("u" stands for unaligned) - -When you can enforce aligned reads, always use the first one - ----- - -Assuming that both arrays are initially aligned: - -```cpp -void aplusb_unaligned() { - for (int i = 3; i + 7 < n; i += 8) { - __m256i x = _mm256_loadu_si256((__m256i*) &a[i]); - __m256i y = _mm256_loadu_si256((__m256i*) &b[i]); - __m256i z = _mm256_add_epi32(x, y); - _mm256_storeu_si256((__m256i*) &c[i], z); - } -} -``` - -...will be 30% slower than this: - -```cpp -void aplusb_aligned() { - for (int i = 0; i < n; i += 8) { - __m256i x = _mm256_load_si256((__m256i*) &a[i]); - __m256i y = _mm256_load_si256((__m256i*) &b[i]); - __m256i z = _mm256_add_epi32(x, y); - _mm256_store_si256((__m256i*) &c[i], z); - } -} -``` - -In unaligned version, half of reads will be the "bad" ones requesting two cache lines - ----- - -So always ask compiler to align memory for you: - -```cpp -alignas(32) float a[n]; - -for (int i = 0; i < n; i += 8) { - __m256 x = _mm256_load_ps(&a[i]); - // ... 
-} -``` - -(This is also why compilers can't always auto-vectorize efficiently) - - ---- - -## Loop Unrolling - -Simple loops often have some overhead from iterating: - -```cpp -for (int i = 1; i < n; i++) - a[i] = (i % b[i]); -``` - -It is often benefitial to "unroll" them like this: - -```cpp -int i; -for (i = 1; i < n - 3; i += 4) { - a[i] = (i % b[i]); - a[i + 1] = ((i + 1) % b[i + 1]); - a[i + 2] = ((i + 2) % b[i + 2]); - a[i + 3] = ((i + 3) % b[i + 3]); -} - -for (; i < n; i++) - a[i] = (i % b[i]); -``` - -There are trade-offs to it, and compilers are sometimes wrong -Use `#pragma unroll` and `-unroll-loops` to hint compiler what to do - ---- - -## More on Pipelining - -![](https://uops.info/pipeline.png =300x) - -https://uops.info - ----- - -For example, in Sandy Bridge family there are 6 execution ports: -* Ports 0, 1, 5 are for arithmetic and logic operations (ALU) -* Ports 2, 3 are for memory reads -* Port 4 is for memory write - -You can lookup them up in instruction tables -and see figure out which one is the bottleneck - ---- - -## SIMD + ILP - -* As all instructions, SIMD operations can be pipelined too -* To leverage it, we need to create opportunities for instruction-level parallelism -* A+B is fine, but array sum still has dependency on the previous vector -* Apply the same trick: calculate partial sums, but using multiple registers - ----- - -For example, instead of this: - -```cpp -s += a0; -s += a1; -s += a2; -s += a3; -... -``` - -...we split it between accumulators and utilize ILP: - -```cpp -s0 += a0; -s1 += a1; -s0 += a2; -s1 += a3; -... -s = s0 + s1; -``` - ---- - -## Practical Tips - -* Compile to assembly: `g++ -S ...` (or go to godbolt.org) -* See which loops get autovectorized: `g++ -fopt-info-vec-optimized ...` -* Typedefs can be handy: `typedef __m256i reg` -* You can use bitsets to "print" a SIMD register: - -```cpp -template -void print(T var) { - unsigned *val = (unsigned*) &var; - for (int i = 0; i < 4; i++) - cout << bitset<32>(val[i]) << " "; - cout << endl; -} \ No newline at end of file diff --git a/content/english/hpc/simd/auto-vectorization.md b/content/english/hpc/simd/auto-vectorization.md index 154244e1..b7b8a45f 100644 --- a/content/english/hpc/simd/auto-vectorization.md +++ b/content/english/hpc/simd/auto-vectorization.md @@ -1,34 +1,34 @@ --- -title: Auto-Vectorization +title: Auto-Vectorization and SPMD weight: 10 --- -Most often, SIMD is used for "embarrassingly parallel" computations: the ones where all you do is apply some elementwise function to all elements of an array and write it back somewhere else. In this setting, you don't even need to know how SIMD works: the compiler is perfectly capable of optimizing such loops by itself. All you need to know is that such optimization exists and yields a 5-10x speedup. +SIMD parallelism is most often used for *embarrassingly parallel* computations: the kinds where all you do is apply some elementwise function to all elements of an array and write it back somewhere else. In this setting, you don't even need to know how SIMD works: the compiler is perfectly capable of optimizing such loops by itself — you just need to be aware that such optimization exists and that it usually yields a 5-10x speedup. -But most computations are not like that, and even the loops that seem straightforward to vectorize are often not optimized because of some tricky technical nuances. 
In this section, we will discuss how to assist the compiler in vectorization and walk through some more complicated patterns of using SIMD. +Doing nothing and relying on auto-vectorization is actually the most popular way of using SIMD. In fact, in many cases, it even advised to stick with the plain scalar code for its simplicity and maintainability. -## Assisting Autovectorization +But often even the loops that seem straightforward to vectorize are not optimized because of some technical nuances. [As in many other cases](/hpc/compilation/contracts), the compiler may need some additional input from the programmer as he may know a bit more about the problem than what can be inferred from static analysis. -Of course, the preferred way of using SIMD is by the means of autovectorization. Whenever you can, you should always stick with the scalar code for its simplicity and maintainability. But, [as in many other cases](/hpc/analyzing-performance/compilation), compiler often needs some additional input from the programmer, who may know a little bit more about the problem. +### Potential Problems -Consider the "a+b" example: +Consider the "a + b" example we [started with](../intrinsics/#simd-intrinsics): ```c++ -void sum(int a[], int b[], int c[], int n) { +void sum(int *a, int *b, int *c, int n) { for (int i = 0; i < n; i++) c[i] = a[i] + b[i]; } ``` -This function can't be replaced with the vectorized variant automatically. Why? +Let's step into a compiler's shoes and think about what can go wrong when this loop is vectorized. -First, vectorization here is not always technically correct. Assuming that `a[]` and `c[]` intersect in a way that their beginnings differ by a single position — because who knows, maybe the programmer wanted to calculate the Fibonacci sequence through a convolution this way. In this case, the data in the SIMD blocks will intersect, and the observed behavior will differ from the one in the scalar case. +**Array size.** If the array size is unknown beforehand, it may be that it is too small for vectorization to be beneficial in the first place. Even if it is sufficiently large, we need to insert an additional check for the remainder of the loop to process it scalar, which would cost us a branch. -Second, we don't know anything about the alignment of these arrays, and we can lose some performance here by using unaligned instructions. +To eliminate these runtime checks, use array sizes that are compile-time constants, and preferably pad arrays to the nearest multiple of the SIMD block size. -On high (`-O3`) levels of optimization, when the compiler suspects that the function may be used for large cycles, it generates two implementation variants — a SIMDized and a "safe" one — and inserts runtime checks to choose between the two. +**Memory aliasing.** Even when array size issues are out of the question, vectorizing this loop is not always technically correct. For example, the arrays `a` and `c` can intersect in a way that their beginnings differ by a single position — because who knows, maybe the programmer wanted to calculate the Fibonacci sequence through a convolution this way. In this case, the data in the SIMD blocks will intersect and the observed behavior will differ from the one in the scalar case. -To avoid these runtime checks, we can tell compiler that we are sure that nothing will break. 
One way to do this is using the `__restrict__` keyword:
+When the compiler can't rule out that the function may be called with intersecting arrays, it has to generate two implementation variants — a vectorized and a "safe" one — and insert runtime checks to choose between the two. To avoid them, we can tell the compiler that we are sure that no memory is aliased by adding the `__restrict__` keyword:

```cpp
void add(int * __restrict__ a, const int * __restrict__ b, int n) {
@@ -37,7 +37,7 @@ void add(int * __restrict__ a, const int * __restrict__ b, int n) {
}
```

-The other, specific to SIMD, is the "ignore vector dependencies" pragma, which is the way to tell compiler that we are sure there are no dependencies between the loop iterations:
+The other way, specific to SIMD, is the "ignore vector dependencies" pragma. It is a general way to inform the compiler that there are no dependencies between the loop iterations:

```c++
#pragma GCC ivdep
@@ -45,6 +45,22 @@ for (int i = 0; i < n; i++)
    // ...
```

-There are [many other ways](https://software.intel.com/sites/default/files/m/4/8/8/2/a/31848-CompilerAutovectorizationGuide.pdf) of hinting compiler what we meant exactly, but in especially complex cases — when inside the loop there are a lot of branches or some functions are called — it is easier to go down to the intrinsics level and write it yourself.
+**Alignment.** The compiler also doesn't know anything about the alignment of these arrays and has to either process some elements at the beginning of these arrays before starting the vectorized section or potentially lose some performance by using [unaligned memory accesses](../moving).
-`std::assume_aligned`, specifiers. This is useful for SIMD instructions that need memory alignment guarantees
+To help the compiler eliminate this corner case, we can use the `alignas` specifier on static arrays and the `std::assume_aligned` function to mark pointers aligned.
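+
+For illustration, here is a minimal sketch of how these two hints can look in code (`std::assume_aligned` requires C++20; the array and function names here are arbitrary):
+
+```c++
+#include <memory>  // for std::assume_aligned (C++20)
+
+alignas(32) int a[1024], b[1024];  // static arrays aligned to the 32-byte SIMD block
+
+void add(int * __restrict__ p, const int * __restrict__ q, int n) {
+    // promise the compiler that both pointers are 32-byte aligned
+    p = std::assume_aligned<32>(p);
+    q = std::assume_aligned<32>(q);
+    for (int i = 0; i < n; i++)
+        p[i] += q[i];
+}
+```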
+
+**Checking if vectorization happened.** In either case, it is useful to check whether the compiler vectorized the loop the way you intended. You can either [compile it to assembly](/hpc/compilation/stages) and look for blocks of instructions that start with a "v" or add the `-fopt-info-vec-optimized` compiler flag so that the compiler indicates where auto-vectorization is happening and what SIMD width is being used. If you swap `optimized` for `missed` or `all`, you may also get some reasoning behind why it is not happening in other places.
+
+There are [many other ways](https://software.intel.com/sites/default/files/m/4/8/8/2/a/31848-CompilerAutovectorizationGuide.pdf) of telling the compiler exactly what we mean, but in especially complex cases — e.g., when there are a lot of branches or function calls inside the loop — it is easier to go one level of abstraction down and vectorize manually.
+
+### SPMD
+
+There is a neat compromise between auto-vectorization and the manual use of SIMD intrinsics: "single program, multiple data" (SPMD). This is a model of computation in which the programmer writes what appears to be a regular serial program, but one that is actually executed in parallel on the hardware.
+
+The programming experience is largely the same, and there is still the fundamental limitation that the computation must be data-parallel, but SPMD ensures that the vectorization will happen regardless of the compiler and the target CPU architecture. It also allows for the computation to be automatically parallelized across multiple cores and, in some cases, even offloaded to other types of parallel hardware.
+
+There is support for SPMD in some modern languages ([Julia](https://docs.julialang.org/en/v1/base/base/#Base.SimdLoop.@simd)), multiprocessing APIs ([OpenMP](https://www.openmp.org/spec-html/5.0/openmpsu42.html)), and specialized compilers (Intel [ISPC](https://ispc.github.io/)), but it has seen the most success in the context of GPU programming, where both the problems and the hardware are massively parallel.
+
+We will cover this model of computation in much more depth in Part 2.
+
+
diff --git a/content/english/hpc/simd/cookbook.md b/content/english/hpc/simd/cookbook.md
deleted file mode 100644
index 90a3e8f4..00000000
--- a/content/english/hpc/simd/cookbook.md
+++ /dev/null
@@ -1,17 +0,0 @@
----
-title: SSE & AVX Cookbook
-weight: 11
-draft: true
----
-
-## Constexpr
-
-## Popcnt
-
-### Naive
-
-### 8-bit lookup
-
-### gather
-
-### pshufb
diff --git a/content/english/hpc/simd/img/filter.svg b/content/english/hpc/simd/img/filter.svg
new file mode 100644
index 00000000..99422714
--- /dev/null
+++ b/content/english/hpc/simd/img/filter.svg
@@ -0,0 +1,1401 @@
+[SVG image data omitted: 1,401 lines of markup for the "filter" illustration]
diff --git a/content/english/hpc/simd/img/gather-scatter.png b/content/english/hpc/simd/img/gather-scatter.png
new file mode 100644
index 00000000..a9f829fe
Binary files /dev/null and b/content/english/hpc/simd/img/gather-scatter.png differ
diff --git a/content/english/hpc/simd/img/gather.svg b/content/english/hpc/simd/img/gather.svg
new file mode 100644
index 00000000..5f35484f
--- /dev/null
+++ b/content/english/hpc/simd/img/gather.svg
@@ -0,0 +1,1233 @@
+[SVG image data omitted: 1,233 lines of markup for the "gather" illustration]
diff --git a/content/english/hpc/simd/intrinsics.md b/content/english/hpc/simd/intrinsics.md
index 88d2c861..4e9c6804 100644
--- a/content/english/hpc/simd/intrinsics.md
+++
b/content/english/hpc/simd/intrinsics.md @@ -6,15 +6,15 @@ weight: 1 The most low-level way to use SIMD is to use the assembly vector instructions directly — they aren't different from their scalar equivalents at all — but we are not going to do that. Instead, we will use *intrinsic* functions mapping to these instructions that are available in modern C/C++ compilers. -In this section, we will go through the basics of their syntax, and in the rest of this chapter we will use them extensively to do things that are actually interesting. +In this section, we will go through the basics of their syntax, and in the rest of this chapter, we will use them extensively to do things that are actually interesting. ## Setup To use x86 intrinsics, we need to do a little groundwork. -First, we need to determine which extensions are supported by the hardware. On Linux, you can call `cat /proc/cpuinfo`, and on other platforms you'd better go to [WikiChip](https://en.wikichip.org/wiki/WikiChip) and look it up there using the name of the CPU. In either case, there should be a `flags` section that lists the codes of all supported vector extensions. +First, we need to determine which extensions are supported by the hardware. On Linux, you can call `cat /proc/cpuinfo`, and on other platforms, you'd better go to [WikiChip](https://en.wikichip.org/wiki/WikiChip) and look it up there using the name of the CPU. In either case, there should be a `flags` section that lists the codes of all supported vector extensions. -There is also a special [CPUID](https://en.wikipedia.org/wiki/CPUID) assembly instruction that lets you query various information about the CPU, including the support of particular vector extensions. It is primarily used to get such information in runtime in order to avoid distributing a separate binary for each microarchitecture. Its output information is returned very densely in the form of feature masks, so compilers provide built-in methods to make sense of it. Here is an example: +There is also a special [CPUID](https://en.wikipedia.org/wiki/CPUID) assembly instruction that lets you query various information about the CPU, including the support of particular vector extensions. It is primarily used to get such information in runtime and avoid distributing a separate binary for each microarchitecture. Its output information is returned very densely in the form of feature masks, so compilers provide built-in methods to make sense of it. Here is an example: ```c++ #include @@ -31,9 +31,9 @@ int main() { } ``` -Second, we need to include a header file that contains the subset of intrinsics we need. Similar to `` in GCC, there is `` header that contains all of them, so we will just use that. +Second, we need to include a header file that contains the subset of intrinsics we need. Similar to `` in GCC, there is the `` header that contains all of them, so we will just use that. -And last, we need to tell the compiler that the target CPU actually supports these extensions. This can be done either with `#pragma GCC target(...)` [as we did before](../), or with `-march=...` flag in the compiler options. If you are compiling and running the code on the same machine, you can set `-march=native` to auto-detect the microarchitecture. +And last, we need to [tell the compiler](/hpc/compilation/flags) that the target CPU actually supports these extensions. This can be done either with `#pragma GCC target(...)` [as we did before](../), or with the `-march=...` flag in the compiler options. 
If you are compiling and running the code on the same machine, you can set `-march=native` to auto-detect the microarchitecture. In all further code examples, assume that they begin with these lines: @@ -47,9 +47,9 @@ In all further code examples, assume that they begin with these lines: using namespace std; ``` -We will focus on AVX2 and the previous SIMD extensions in this chapter, which should be available on 95% of all desktop and server computers, although the general principles transfer on AVX512, Arm Neon and other SIMD architectures just as well. +We will focus on AVX2 and the previous SIMD extensions in this chapter, which should be available on 95% of all desktop and server computers, although the general principles transfer on AVX512, Arm Neon, and other SIMD architectures just as well. -## SIMD Registers +### SIMD Registers The most notable distinction between SIMD extensions is the support for wider registers: @@ -67,9 +67,9 @@ C/C++ compilers implement special *vector types* that refer to the data stored i - 256-bit `__m256`, `__m256d`, `__m256i`; - 512-bit `__m512`, `__m512d`, `__m512i`. -Registers themselves can hold data of any kind: these types are only used for type checking. To convert a variable to another type, you can do it the same way you would convert any other type, and it won't cost you anything. +Registers themselves can hold data of any kind: these types are only used for type checking. You can convert a vector variable to another vector type the same way you would normally convert any other type, and it won't cost you anything. -## SIMD Intrinsics +### SIMD Intrinsics *Intrinsics* are just C-style functions that do something with these vector data types, usually by simply calling the associated assembly instruction. @@ -95,17 +95,14 @@ for (int i = 0; i < 100; i += 4) { The main challenge of using SIMD is getting the data into contiguous fixed-sized blocks suitable for loading into registers. In the code above, we may in general have a problem if the length of the array is not divisible by the block size. There are two common solutions to this: -1. We can "overshoot" by iterating over the last incomplete segment either way. To make sure sure we don't segfault by trying to read from or write to a memory region we don't own, we need to pad the arrays to the nearest block size (typically with some "neutral" element, e. g. zero). +1. We can "overshoot" by iterating over the last incomplete segment either way. To make sure we don't segfault by trying to read from or write to a memory region we don't own, we need to pad the arrays to the nearest block size (typically with some "neutral" element, e.g., zero). 2. Make one iteration less and write a little loop in the end that calculates the remainder normally (with scalar operations). -Humans prefer #1, because it is simpler and results in less code. Compilers prefer #2, because they don't really have another legal option. +Humans prefer #1 because it is simpler and results in less code, and compilers prefer #2 because they don't really have another legal option. ### Instruction References -are all generated by cats walking on keyboards. -If I'm wrong, explain this: punpcklqdq - -Most SIMD intrinsics follow a naming convention similar to `_mm__`, and are relatively self-explanatory once you get used to the assembly naming conventions. +Most SIMD intrinsics follow a naming convention similar to `_mm__` and correspond to a single analogously named assembly instruction. 
They become relatively self-explanatory once you get used to the assembly naming conventions, although sometimes it does seem like their names were generated by cats walking on keyboards (explain this: [punpcklqdq](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=3037,3009,4870,4870,4872,4875,833,879,874,849,848,6715,4845,6046,3853,288,6570,6527,6527,90,7307,6385,5993,2692,6946,6949,5456,6938,5456,1021,3007,514,518,4875,7253,7183,3892,5135,5260,5259,6385,3915,4027,3873,7401&techs=AVX,AVX2&text=punpcklqdq)). Here are a few more examples, just so that you get the gist of it: @@ -116,13 +113,15 @@ Here are a few more examples, just so that you get the gist of it: - `_mm256_cmpeq_epi32`: compare 8+8 packed `int`s and return a mask that contains ones for equal element pairs. - `_mm256_blendv_ps`: pick elements from one of two vectors according to a mask. -As you may have guessed, there is a combinatorially very large number of intrinsics. A very helpful reference for x86 SIMD intrinsics is the [Intel Intrinsics Guide](https://software.intel.com/sites/landingpage/IntrinsicsGuide/), which has groupings by categories and extensions, descriptions, pseudocode, associated assembly instructions, and their latency and throughput on Intel microarchitectures. You may want to bookmark that page. +As you may have guessed, there is a combinatorially very large number of intrinsics, and in addition to that, some instructions also have immediate values — so their intrinsics require compile-time constant parameters: for example, the floating-point comparison instruction [has 32 different modifiers](https://stackoverflow.com/questions/16988199/how-to-choose-avx-compare-predicate-variants). -The Intel reference is useful when you know that a specific instruction exists and just want to look up its name or performance info. When you don't know whether it exists, this [cheat sheet](https://db.in.tum.de/~finis/x86%20intrinsics%20cheat%20sheet%20v1.0.pdf) may do a better job. +For some reason, there are some operations that are agnostic to the type of data stored in registers, but only take a specific vector type (usually 32-bit float) — you just have to convert to and from it to use that intrinsic. To simplify the examples in this chapter, we will mostly work with 32-bit integers (`epi32`) in 256-bit AVX2 registers. -### Instruction Selection +A very helpful reference for x86 SIMD intrinsics is the [Intel Intrinsics Guide](https://software.intel.com/sites/landingpage/IntrinsicsGuide/), which has groupings by categories and extensions, descriptions, pseudocode, associated assembly instructions, and their latency and throughput on Intel microarchitectures. You may want to bookmark that page. + +The Intel reference is useful when you know that a specific instruction exists and just want to look up its name or performance info. When you don't know whether it exists, this [cheat sheet](https://db.in.tum.de/~finis/x86%20intrinsics%20cheat%20sheet%20v1.0.pdf) may do a better job. -Note that compilers do not necessarily pick the exact instruction that you specify. Similar to the scalar `c = a + b` we [discussed before](/hpc/analyzing-performance/assembly), there is a fused vector addition instruction too, so instead of using 2+1+1=4 instructions per loop cycle, compiler [rewrites the code above](https://godbolt.org/z/dMz8E5Ye8) with blocks of 3 instructions like this: +**Instruction selection.** Note that compilers do not necessarily pick the exact instruction that you specify. 
Similar to the scalar `c = a + b` we [discussed before](/hpc/analyzing-performance/assembly), there is a fused vector addition instruction too, so instead of using 2+1+1=4 instructions per loop cycle, compiler [rewrites the code above](https://godbolt.org/z/dMz8E5Ye8) with blocks of 3 instructions like this: ```nasm vmovapd ymm1, YMMWORD PTR a[rax] @@ -130,9 +129,17 @@ vaddpd ymm0, ymm1, YMMWORD PTR b[rax] vmovapd YMMWORD PTR c[rax], ymm0 ``` -Also, some of the intrinsics are not direct instructions, but short sequences of instructions. One example is the `extract` group of instructions, which are used to get individual elements out of vectors (e. g. `_mm256_extract_epi32(x, 0)` returns the first element out of 8-integer vector); it is quite slow (~5 cycles) to move data between "normal" and SIMD registers in general. +Sometimes, although quite rarely, this compiler interference makes things worse, so it is always a good idea to [check the assembly](/hpc/compilation/stages) and take a closer look at the emitted vector instructions (they usually start with a "v"). + +Also, some of the intrinsics don't map to a single instruction but a short sequence of them, as a convenient shortcut: [broadcasts and extracts](../moving#register-aliasing) are a notable example. + + + +### GCC Vector Extensions If you feel like the design of C intrinsics is terrible, you are not alone. I've spent hundreds of hours writing SIMD code and reading the Intel Intrinsics Guide, and I still can't remember whether I need to type `_mm256` or `__m256`. @@ -140,7 +147,7 @@ Intrinsics are not only hard to use but also neither portable nor maintainable. One day, compiler engineers from the GNU Project thought the same way and developed a way to define your own vector types that feel more like arrays with some operators overloaded to match the relevant instructions. -In GCC, here is how you can define vector of 8 integers packed into a 256-bit (32-byte) register: +In GCC, here is how you can define a vector of 8 integers packed into a 256-bit (32-byte) register: ```c++ typedef int v8si __attribute__ (( vector_size(32) )); @@ -149,7 +156,7 @@ typedef int v8si __attribute__ (( vector_size(32) )); Unfortunately, this is not a part of the C or C++ standard, so different compilers use different syntax for that. -There is somewhat of a naming convention, which is to include size and type of elements into the name of the type: in the example above, we defined a "vector of 8 signed integers". But you may choose any name you want, like `vec`, `reg` or whatever. The only thing you don't want to do is to name it `vector` because of how much confusion there would be because of `std::vector`. +There is somewhat of a naming convention, which is to include size and type of elements into the name of the type: in the example above, we defined a "vector of 8 signed integers." But you may choose any name you want, like `vec`, `reg` or whatever. The only thing you don't want to do is to name it `vector` because of how much confusion there would be because of `std::vector`. The main advantage of using these types is that for many operations you can use normal C++ operators instead of looking up the relevant intrinsic. @@ -178,24 +185,13 @@ for (int i = 0; i < 100/4; i++) c[i] = a[i] + b[i]; ``` -As you can see, vector extensions are much cleaner compared to the nightmare we have with intrinsic functions. But some things that we may want to do are just not expressible with native C++ constructs, so we will still need intrinsics. 
Luckily, this is not an exclusive choice, because vector types support zero-cost conversion to the `_mm` types and back. We will, however, try to avoid doing so as much as possible and stick to vector extensions when we can.
-
-## Tips
-
-First of all, it is very useful to check if vectorization happened the way you intended by [compiling it to assembly](/hpc/analyzing-performance/compilation) and taking a close look at the emitted instructions that start with "v".
-
-Also, if you specify the `-fopt-info-vec-optimized` flag, then compiler will directly indicate where autovectorization is happening and what SIMD width is being used. If you swap `optimized` for `missed` or `all`, you may also get reasons why it is not happening in other places.
-
-When using SIMD manually, it helps to print out contents of vector registers for debug purposes. You can do so by converting a vector variable into an array and then into a bitset:
+As you can see, vector extensions are much cleaner compared to the nightmare we have with intrinsic functions. Their downside is that some things we may want to do are just not expressible with native C++ constructs, so we will still need intrinsics for them. Luckily, this is not an exclusive choice, because vector types support zero-cost conversion to the `_mm` types and back:

```c++
-template
-void print(T var) {
-    unsigned *val = (unsigned*) &var;
-    for (int i = 0; i < 4; i++)
-        cout << bitset<32>(val[i]) << " ";
-    cout << endl;
-}
+v8f x;
+int mask = _mm256_movemask_ps((__m256) x);
```

-In this particular case, it outputs 4 groups of 32 bits of a 128-bit wide vector.
+There are also many third-party libraries for different languages that provide a similar capability to write portable SIMD code and, in general, are nicer to use than both intrinsics and built-in vector types. Notable examples for C++ are [Highway](https://github.com/google/highway), [Expressive Vector Engine](https://github.com/jfalcou/eve), [Vector Class Library](https://github.com/vectorclass/version2), and [xsimd](https://github.com/xtensor-stack/xsimd).
+
+Using a well-established SIMD library is recommended, as it greatly improves the developer experience. In this book, however, we will try to keep close to the hardware and mostly use intrinsics directly, occasionally switching to the vector extensions for simplicity when we can.
diff --git a/content/english/hpc/simd/loading.md b/content/english/hpc/simd/loading.md
deleted file mode 100644
index 5d1d75ba..00000000
--- a/content/english/hpc/simd/loading.md
+++ /dev/null
@@ -1,56 +0,0 @@
----
-title: Loading and Writing Data
-aliases: [/hpc/simd/vectorization]
-weight: 2
----
-
-Operations of reading and writing the contents of a SIMD register into memory have two versions each: `load` / `loadu` and `store` / `storeu`. The letter "u" here stands for "unaligned". The difference is that the former ones only work correctly when the read / written block fits inside a single cache line (and crash otherwise), while the latter work either way, but with a slight performance penalty if the block crosses a cache line.
-
-Sometimes, especially when the "inner" operation is very lightweight, the performance difference becomes significant (at least because you need to fetch two cache lines instead of one).
As an extreme example, this way of adding two arrays together: - -```c++ -for (int i = 3; i + 7 < n; i += 8) { - __m256i x = _mm256_loadu_si256((__m256i*) &a[i]); - __m256i y = _mm256_loadu_si256((__m256i*) &b[i]); - __m256i z = _mm256_add_epi32(x, y); - _mm256_storeu_si256((__m256i*) &c[i], z); -} -``` - -…is ~30% slower than its aligned version: - -```c++ -for (int i = 0; i < n; i += 8) { - __m256i x = _mm256_load_si256((__m256i*) &a[i]); - __m256i y = _mm256_load_si256((__m256i*) &b[i]); - __m256i z = _mm256_add_epi32(x, y); - _mm256_store_si256((__m256i*) &c[i], z); -} -``` - -In the first version, assuming that arrays `a`, `b` and `c` are all 64-byte *aligned* (the addresses of their first elements are divisible by 64, and so they start at the beginning of a cache line), roughly half of reads and writes will be "bad" because they cross a cache line boundary. - -### Data Alignment - -By default, when you allocate an array, the only guarantee about its alignment you get is that none of its elements are split by a cache line. For an array of `int`, this means that it gets the alignment of 4 bytes (`sizeof int`), which lets you load exactly one cache line when reading any element. - -For our purposes, we want to guarantee that any (256-bit = 32-byte) SIMD block will not be split, so we need to specify the alignment of 32 bytes. For static arrays, we can do so with the `alignas` specifier: - -```c++ -alignas(32) float a[n]; - -for (int i = 0; i < n; i += 8) { - __m256 x = _mm256_load_ps(&a[i]); - // ... -} -``` - -For allocating an array dynamically, we can use `std::aligned_alloc` which takes the alignment value and the size of array in bytes, and returns a pointer to the allocated memory (just like `new` does), which should be explicitly deleted when no longer used. - -On most modern architectures, the `loadu` / `storeu` intrinsics should be equally as fast as `load` / `store` given that in both cases the blocks only intersect one cache line. The advantage of the latter is that they can act as free assertions that all reads and writes are aligned. It is worth noting that the GCC vector extensions always assume aligned memory reads and writes. Memory alignment issues is also one of the reasons why compilers can't always autovectorize efficiently. - -### Register Aliasing - -MMX was originally used the integer (64-bit mantissa) part of a 80-bit float. - - diff --git a/content/english/hpc/simd/masking.md b/content/english/hpc/simd/masking.md index 489d2ff2..dbe71575 100644 --- a/content/english/hpc/simd/masking.md +++ b/content/english/hpc/simd/masking.md @@ -3,81 +3,347 @@ title: Masking and Blending weight: 4 --- -If you took some time to study [the reference](https://software.intel.com/sites/landingpage/IntrinsicsGuide), you may have noticed that there are essentially two major groups of vector operations: +One of the bigger challenges of SIMD programming is that its options for control flow are very limited — because the operations you apply to a vector are the same for all its elements. -1. Instructions that perform some elementwise operation (`+`, `*`, `<`, `acos`, etc.). -2. Instructions that load, store, mask, shuffle and generally move data around. - -While using the elementwise instructions is easy, the largest challenge with SIMD is getting the data in vector registers in the first place, with low enough overhead so that the whole endeavor is worthwhile. +This makes the problems that are usually trivially resolved with an `if` or any other type of branching much harder. 
With SIMD, they have to be dealt with by the means of various [branchless programming](/hpc/pipelining/branchless) techniques, which aren't always that straightforward to apply. ### Masking -SIMD has no easy way to do branching, because the control flow should be the same for all elements in a vector. To overcome this limitation, we can "mask" operations that should only be performed on a subset of elements, in a way similar to how a [conditional move](/hpc/analyzing-performance/assembly) is executed. +The main way to make a computation branchless is through *predication* — computing the results of both branches and then using either some arithmetic trick or a special "conditional move" instruction: + +```c++ +for (int i = 0; i < N; i++) + a[i] = rand() % 100; + +int s = 0; + +// branch: +for (int i = 0; i < N; i++) + if (a[i] < 50) + s += a[i]; + +// no branch: +for (int i = 0; i < N; i++) + s += (a[i] < 50) * a[i]; + +// also no branch: +for (int i = 0; i < N; i++) + s += (a[i] < 50 ? a[i] : 0); +``` + +To vectorize this loop, we are going to need two new instructions: + +- `_mm256_cmpgt_epi32`, which compares the integers in two vectors and produces a mask of all ones if the first element is more than the second and a mask of full zeros otherwise. +- `_mm256_blendv_epi8`, which blends (combines) the values of two vectors based on the provided mask. + +By masking and blending the elements of a vector so that only the selected subset of them is affected by computation, we can perform predication in a manner similar to the conditional move: + +```c++ +const reg c = _mm256_set1_epi32(49); +const reg z = _mm256_setzero_si256(); +reg s = _mm256_setzero_si256(); + +for (int i = 0; i < N; i += 8) { + reg x = _mm256_load_si256( (reg*) &a[i] ); + reg mask = _mm256_cmpgt_epi32(x, c); + x = _mm256_blendv_epi8(x, z, mask); + s = _mm256_add_epi32(s, x); +} +``` + +(Minor details such as [horizontal summation and accounting for the remainder of the array](../reduction) are omitted for brevity.) + +This is how predication is usually done in SIMD, but it isn't always the most optimal approach. We can use the fact that one of the blended values is zero, and use bitwise `and` with the mask instead of blending: + +```c++ +const reg c = _mm256_set1_epi32(50); +reg s = _mm256_setzero_si256(); + +for (int i = 0; i < N; i += 8) { + reg x = _mm256_load_si256( (reg*) &a[i] ); + reg mask = _mm256_cmpgt_epi32(c, x); + x = _mm256_and_si256(x, mask); + s = _mm256_add_epi32(s, x); +} +``` + +This loop performs slightly faster because on this particular CPU, the vector `and` takes one cycle less than `blend`. + +Several other instructions support masks as inputs, most notably: + +- The `_mm256_blend_epi32` intrinsic is a `blend` that takes an 8-bit integer mask instead of a vector (which is why it doesn't have `v` at the end). +- The `_mm256_maskload_epi32` and `_mm256_maskstore_epi32` intrinsics that load/store a SIMD block from memory and `and` it with a mask in one go. + +We can also use predication with built-in vector types: + +```c++ +vec *v = (vec*) a; +vec s = {}; + +for (int i = 0; i < N / 8; i++) + s += (v[i] < 50 ? v[i] : 0); +``` + +All these versions work at around 13 GFLOPS as this example is so simple that the compiler can vectorize the loop all by itself. Let's move on to more complex examples that can't be auto-vectorized. + +### Searching -Consider the following problem: for some reason, we need to raise $10^8$ random integers to some random powers. 
+In the next example, we need to find a specific value in an array and return its position (aka `std::find`): ```c++ -const int n = 1e8; -alignas(32) unsigned bases[n], results[n], powers[n]; +const int N = (1<<12); +int a[N]; + +int find(int x) { + for (int i = 0; i < N; i++) + if (a[i] == x) + return i; + return -1; +} ``` -In SSE/AVX, [doing modular reduction](/hpc/arithmetic/integer) is even more complicated than in the scalar case (e. g. SSE has no integer division in the first place), so we will perform all operations modulo $2^{32}$ by naturally overflowing an `unsigned int`. +To benchmark the `find` function, we fill the array with numbers from $0$ to $(N - 1)$ and then repeatedly search for a random element: -We'd normally do it by exponentiation by squaring: +```c++ +for (int i = 0; i < N; i++) + a[i] = i; + +for (int t = 0; t < K; t++) + checksum ^= find(rand() % N); +``` + +The scalar version gives ~4 GFLOPS of performance. This number includes the elements we haven't had to process, so divide this number by two in your head (the expected fraction of the elements we have to check). + +To vectorize it, we need to compare a vector of its elements with the searched value for equality, producing a mask, and then somehow check if this mask is zero. If it isn't, the needed element is somewhere within this block of 8. + +To check if the mask is zero, we can use the `_mm256_movemask_ps` intrinsic, which takes the first bit of each 32-bit element in a vector and produces an 8-bit integer mask out of them. We can then check if this mask is non-zero — and if it is, also immediately get the index with the `ctz` instruction: ```c++ -void binpow_simple() { - for (int i = 0; i < n; i++) { - unsigned a = bases[i], p = powers[i]; - - unsigned res = 1; - while (p > 0) { - if (p & 1) - res = (res * a); - a = (a * a); - p >>= 1; +int find(int needle) { + reg x = _mm256_set1_epi32(needle); + + for (int i = 0; i < N; i += 8) { + reg y = _mm256_load_si256( (reg*) &a[i] ); + reg m = _mm256_cmpeq_epi32(x, y); + int mask = _mm256_movemask_ps((__m256) m); + if (mask != 0) + return i + __builtin_ctz(mask); + } + + return -1; +} +``` + +This version gives ~20 GFLOPS or about 5 times faster than the scalar one. It only uses 3 instructions in the hot loop: + +```nasm +vpcmpeqd ymm0, ymm1, YMMWORD PTR a[0+rdx*4] +vmovmskps eax, ymm0 +test eax, eax +je loop +``` + +Checking if a vector is zero is a common operation, and there is an operation similar to `test` in SIMD that we can use: + +```c++ +int find(int needle) { + reg x = _mm256_set1_epi32(needle); + + for (int i = 0; i < N; i += 8) { + reg y = _mm256_load_si256( (reg*) &a[i] ); + reg m = _mm256_cmpeq_epi32(x, y); + if (!_mm256_testz_si256(m, m)) { + int mask = _mm256_movemask_ps((__m256) m); + return i + __builtin_ctz(mask); } + } + + return -1; +} +``` + +We are still using `movemask` to do `ctz` later, but the hot loop is now one instruction shorter: - results[i] = res; +```nasm +vpcmpeqd ymm0, ymm1, YMMWORD PTR a[0+rdx*4] +vptest ymm0, ymm0 +je loop +``` + +This doesn't improve performance much because both both `vptest` and `vmovmskps` have a throughput of one and will bottleneck the computation regardless of anything else we do in the loop. 
+ +To work around this limitation, we can iterate in blocks of 16 elements and combine the results of independent comparisons of two 256-bit AVX2 registers using a bitwise `or`: + +```c++ +int find(int needle) { + reg x = _mm256_set1_epi32(needle); + + for (int i = 0; i < N; i += 16) { + reg y1 = _mm256_load_si256( (reg*) &a[i] ); + reg y2 = _mm256_load_si256( (reg*) &a[i + 8] ); + reg m1 = _mm256_cmpeq_epi32(x, y1); + reg m2 = _mm256_cmpeq_epi32(x, y2); + reg m = _mm256_or_si256(m1, m2); + if (!_mm256_testz_si256(m, m)) { + int mask = (_mm256_movemask_ps((__m256) m2) << 8) + + _mm256_movemask_ps((__m256) m1); + return i + __builtin_ctz(mask); + } } + + return -1; } ``` -This code runs in 9.47 seconds. +With this obstacle removed, the performance now peaks at ~34 GFLOPS. But why not 40? Shouldn't it be twice as fast? + +Here is how one iteration of the loop looks in assembly: + +```nasm +vpcmpeqd ymm2, ymm1, YMMWORD PTR a[0+rdx*4] +vpcmpeqd ymm3, ymm1, YMMWORD PTR a[32+rdx*4] +vpor ymm0, ymm3, ymm2 +vptest ymm0, ymm0 +je loop +``` + +Every iteration, we need to execute 5 instructions. While the throughputs of all relevant execution ports allow to do that in one cycle on average, we can't do that because the decode width of this particular CPU (Zen 2) is 4. Therefore, the performance is limited by ⅘ of what it could have been. + + + +To mitigate this, we can once again double the number of SIMD blocks we process on each iteration: ```c++ -typedef __m256i reg; - -void binpow_simd() { - const reg ones = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1); - for (int i = 0; i < n; i += 8) { - reg a = _mm256_load_si256((__m256i*) &bases[i]); - reg p = _mm256_load_si256((__m256i*) &powers[i]); - reg res = ones; - - // in fact, there will not be a cycle here: - // the compiler should unroll it in 32 separate blocks of operations - for (int l = 0; l < 32; l++) { - // instead of explicit branching, calculate a "multiplier" for every element: - // it is either 1 or a, depending on the lowest bit of p - - // masks of elements that should be multiplied by a: - reg mask = _mm256_cmpeq_epi32(_mm256_and_si256(p, ones), ones); - // now we blend a vector of ones and a vector of a using this mask: - reg mul = _mm256_blendv_epi8(ones, a, mask); - // res *= mul: - res = _mm256_mullo_epi32(res, mul); - // a *= a: - a = _mm256_mullo_epi32(a, a); - // p >>= 1: - p = _mm256_srli_epi32(p, 1); +unsigned get_mask(reg m) { + return _mm256_movemask_ps((__m256) m); +} + +reg cmp(reg x, int *p) { + reg y = _mm256_load_si256( (reg*) p ); + return _mm256_cmpeq_epi32(x, y); +} + +int find(int needle) { + reg x = _mm256_set1_epi32(needle); + + for (int i = 0; i < N; i += 32) { + reg m1 = cmp(x, &a[i]); + reg m2 = cmp(x, &a[i + 8]); + reg m3 = cmp(x, &a[i + 16]); + reg m4 = cmp(x, &a[i + 24]); + reg m12 = _mm256_or_si256(m1, m2); + reg m34 = _mm256_or_si256(m3, m4); + reg m = _mm256_or_si256(m12, m34); + if (!_mm256_testz_si256(m, m)) { + unsigned mask = (get_mask(m4) << 24) + + (get_mask(m3) << 16) + + (get_mask(m2) << 8) + + get_mask(m1); + return i + __builtin_ctz(mask); } + } + + return -1; +} +``` + +It now shows the throughput of 43 GFLOPS — or about 10x faster than the original scalar implementation. + +Extending it to 64 values per cycle doesn't help: small arrays suffer from the overhead of all these additional `movemask`-s when we hit the condition, and larger arrays are bottlenecked by [memory bandwidth](/hpc/cpu-cache/bandwidth) anyway. 
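+
+The versions above assume that the array size is a multiple of the SIMD block size. Here is a minimal sketch of handling an arbitrary length `n` by finishing the remainder with a scalar loop (the `find_any` name and the extra `n` parameter are illustrative additions):
+
+```c++
+int find_any(int needle, int n) {
+    reg x = _mm256_set1_epi32(needle);
+
+    int i = 0;
+    for (; i + 8 <= n; i += 8) {          // full 8-element blocks
+        reg y = _mm256_load_si256( (reg*) &a[i] );
+        reg m = _mm256_cmpeq_epi32(x, y);
+        if (!_mm256_testz_si256(m, m)) {
+            int mask = _mm256_movemask_ps((__m256) m);
+            return i + __builtin_ctz(mask);
+        }
+    }
+
+    for (; i < n; i++)                    // scalar tail
+        if (a[i] == needle)
+            return i;
+
+    return -1;
+}
+```
+
+Alternatively, padding the array with a neutral element to a multiple of the block size avoids the scalar tail altogether.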
+ +### Counting Values - _mm256_store_si256((__m256i*) &results[i], res); +As the final exercise, let's find the count of a value in an array instead of just its first occurrence: + +```c++ +int count(int x) { + int cnt = 0; + for (int i = 0; i < N; i++) + cnt += (a[i] == x); + return cnt; +} +``` + +To vectorize it, we just need to convert the comparison mask to either one or zero per element and calculate the sum: + +```c++ +const reg ones = _mm256_set1_epi32(1); + +int count(int needle) { + reg x = _mm256_set1_epi32(needle); + reg s = _mm256_setzero_si256(); + + for (int i = 0; i < N; i += 8) { + reg y = _mm256_load_si256( (reg*) &a[i] ); + reg m = _mm256_cmpeq_epi32(x, y); + m = _mm256_and_si256(m, ones); + s = _mm256_add_epi32(s, m); } + + return hsum(s); } ``` -This implementation now works in 0.7 seconds, or 13.5 times faster, and there is still ample room for improvement. +Both implementations yield ~15 GFLOPS: the compiler can vectorize the first one all by itself. + +But a trick that the compiler can't find is to notice that the mask of all ones is [minus one](/hpc/arithmetic/integer) when reinterpreted as an integer. So we can skip the and-the-lowest-bit part and use the mask itself, and then just negate the final result: + +```c++ +int count(int needle) { + reg x = _mm256_set1_epi32(needle); + reg s = _mm256_setzero_si256(); + + for (int i = 0; i < N; i += 8) { + reg y = _mm256_load_si256( (reg*) &a[i] ); + reg m = _mm256_cmpeq_epi32(x, y); + s = _mm256_add_epi32(s, m); + } + + return -hsum(s); +} +``` + +This doesn't improve the performance in this particular architecture because the throughput is actually bottlenecked by updating `s`: there is a dependency on the previous iteration, so the loop can't proceed faster than one iteration per CPU cycle. We can make use of [instruction-level parallelism](../reduction#instruction-level-parallelism) if we split the accumulator in two: + +```c++ +int count(int needle) { + reg x = _mm256_set1_epi32(needle); + reg s1 = _mm256_setzero_si256(); + reg s2 = _mm256_setzero_si256(); + + for (int i = 0; i < N; i += 16) { + reg y1 = _mm256_load_si256( (reg*) &a[i] ); + reg y2 = _mm256_load_si256( (reg*) &a[i + 8] ); + reg m1 = _mm256_cmpeq_epi32(x, y1); + reg m2 = _mm256_cmpeq_epi32(x, y2); + s1 = _mm256_add_epi32(s1, m1); + s2 = _mm256_add_epi32(s2, m2); + } + + s1 = _mm256_add_epi32(s1, s2); + + return -hsum(s1); +} +``` + +It now gives ~22 GFLOPS of performance, which is as high as it can get. + +When adapting this code for shorter data types, keep in mind that the accumulator may overflow. To work around this, add another accumulator of larger size and regularly stop the loop to add the values in the local accumulator to it and then reset the local accumulator. For example, for 8-bit integers, this means creating another inner loop that does $\lfloor \frac{256-1}{8} \rfloor = 15$ iterations. + + + diff --git a/content/english/hpc/simd/moving.md b/content/english/hpc/simd/moving.md new file mode 100644 index 00000000..72cbbd33 --- /dev/null +++ b/content/english/hpc/simd/moving.md @@ -0,0 +1,216 @@ +--- +title: Moving Data +aliases: [/hpc/simd/vectorization] +weight: 2 +--- + +If you took some time to study [the reference](https://software.intel.com/sites/landingpage/IntrinsicsGuide), you may have noticed that there are essentially two major groups of vector operations: + +1. Instructions that perform some elementwise operation (`+`, `*`, `<`, `acos`, etc.). +2. Instructions that load, store, mask, shuffle, and generally move data around. 
+ +While using the elementwise instructions is easy, the largest challenge with SIMD is getting the data in vector registers in the first place, with low enough overhead so that the whole endeavor is worthwhile. + +### Aligned Loads and Stores + +Operations of reading and writing the contents of a SIMD register into memory have two versions each: `load` / `loadu` and `store` / `storeu`. The letter "u" here stands for "unaligned." The difference is that the former ones only work correctly when the read / written block fits inside a single [cache line](/hpc/cpu-cache/cache-lines) (and crash otherwise), while the latter work either way, but with a slight performance penalty if the block crosses a cache line. + +Sometimes, especially when the "inner" operation is very lightweight, the performance difference becomes significant (at least because you need to fetch two cache lines instead of one). As an extreme example, this way of adding two arrays together: + +```c++ +for (int i = 3; i + 7 < n; i += 8) { + __m256i x = _mm256_loadu_si256((__m256i*) &a[i]); + __m256i y = _mm256_loadu_si256((__m256i*) &b[i]); + __m256i z = _mm256_add_epi32(x, y); + _mm256_storeu_si256((__m256i*) &c[i], z); +} +``` + +…is ~30% slower than its aligned version: + +```c++ +for (int i = 0; i < n; i += 8) { + __m256i x = _mm256_load_si256((__m256i*) &a[i]); + __m256i y = _mm256_load_si256((__m256i*) &b[i]); + __m256i z = _mm256_add_epi32(x, y); + _mm256_store_si256((__m256i*) &c[i], z); +} +``` + +In the first version, assuming that arrays `a`, `b` and `c` are all 64-byte *aligned* (the addresses of their first elements are divisible by 64, and so they start at the beginning of a cache line), roughly half of reads and writes will be "bad" because they cross a cache line boundary. + +Note that the performance difference is caused by the cache system and not by the instructions themselves. On most modern architectures, the `loadu` / `storeu` intrinsics should be equally as fast as `load` / `store` given that in both cases the blocks only span one cache line. The advantage of the latter is that they can act as free run time assertions that all reads and writes are aligned. + +This makes it important to properly [align](/hpc/cpu-cache/alignment) arrays and other data on allocation, and it is also one of the reasons why compilers can't always [auto-vectorize](../auto-vectorization) efficiently. For most purposes, we only need to guarantee that any 32-byte SIMD block will not cross a cache line boundary, and we can specify this alignment with the `alignas` specifier: + + + +```c++ +alignas(32) float a[n]; + +for (int i = 0; i < n; i += 8) { + __m256 x = _mm256_load_ps(&a[i]); + // ... +} +``` + +The [built-in vector types](../intrinsics) already have corresponding alignment requirements and assume aligned memory reads and writes — so you are always safe when allocating an array of `v8si`, but when converting it from `int*` you have to make sure it is aligned. + +Similar to the scalar case, many arithmetic instructions take memory addresses as operands — [vector addition](../intrinsics) is an example — although you can't explicitly use it as an intrinsic and have to rely on the compiler. There are also a few other instructions for reading a SIMD block from memory, notably the [non-temporal](/hpc/cpu-cache/bandwidth#bypassing-the-cache) load and store operations that don't lift accessed data in the cache hierarchy. + +### Register Aliasing + +The first SIMD extension, MMX, started quite small. 
It only used 64-bit vectors, which were conveniently aliased to the mantissa part of an [80-bit float](/hpc/arithmetic/ieee-754) so that there was no need to introduce a separate set of registers. As the vector size grew with later extensions, the same [register aliasing](/hpc/architecture/assembly#instructions-and-registers) mechanism used in general-purpose registers was adopted for the vector registers to maintain backward compatibility: `xmm0` is the first half (128 bits) of `ymm0`, `xmm1` is the first half of `ymm1`, and so on.
+
+This feature, combined with the fact that the vector registers are located in the FPU, makes moving data between them and the general-purpose registers slightly complicated.
+
+### Extract and Insert
+
+To *extract* a specific value from a vector, you can use `_mm256_extract_epi32` and similar intrinsics. It takes the index of the integer to be extracted as the second parameter and generates different instruction sequences depending on its value.
+
+If you need to extract the first element, it generates the `vmovd` instruction (for `xmm0`, the first half of the vector):
+
+```nasm
+vmovd eax, xmm0
+```
+
+For other elements of an SSE vector, it generates the possibly slightly slower `vpextrd`:
+
+```nasm
+vpextrd eax, xmm0, 1
+```
+
+To extract anything from the second half of an AVX vector, it first has to extract that second half and only then the scalar itself. For example, here is how it extracts the last (eighth) element:
+
+```nasm
+vextracti128 xmm0, ymm0, 0x1
+vpextrd eax, xmm0, 3
+```
+
+There is a similar `_mm256_insert_epi32` intrinsic for overwriting specific elements:
+
+```nasm
+mov eax, 42
+
+; v = _mm256_insert_epi32(v, 42, 0);
+vpinsrd xmm2, xmm0, eax, 0
+vinserti128 ymm0, ymm0, xmm2, 0x0
+
+; v = _mm256_insert_epi32(v, 42, 7);
+vextracti128 xmm1, ymm0, 0x1
+vpinsrd xmm2, xmm1, eax, 3
+vinserti128 ymm0, ymm0, xmm2, 0x1
+```
+
+Takeaway: moving scalar data to and from vector registers is slow, especially when it isn't the first element.
+
+### Making Constants
+
+If you need to populate not just one element but the entire vector, you can use the `_mm256_setr_epi32` intrinsic:
+
+```c++
+__m256i iota = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+```
+
+The "r" here stands for "reversed" — from [the CPU point of view](/hpc/arithmetic/integer#integer-types), not for humans. There is also `_mm256_set_epi32` (without the "r") that fills the values in the opposite order. Both are mostly used to create compile-time constants that are then fetched into the register with a block load. If your use case is filling a vector with zeros, use `_mm256_setzero_si256` instead: it `xor`-s the register with itself.
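+
+To make the "reversed" part concrete: the first two lines below produce exactly the same register (a small standalone sketch, not tied to any benchmark):
+
+```c++
+__m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); // element #0 is 0, ..., element #7 is 7
+__m256i b = _mm256_set_epi32 (7, 6, 5, 4, 3, 2, 1, 0); // the same register, arguments reversed
+__m256i z = _mm256_setzero_si256();                    // all zeros, compiled to a register self-xor
+```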
+ +In built-in vector types, you can just use normal braced initialization: + +```c++ +vec zero = {}; +vec iota = {0, 1, 2, 3, 4, 5, 6, 7}; +``` + +### Broadcast + +Instead of modifying just one element, you can also *broadcast* a single value into all its positions: + +```nasm +; __m256i v = _mm256_set1_epi32(42); +mov eax, 42 +vmovd xmm0, eax +vpbroadcastd ymm0, xmm0 +``` + +This is a frequently used operation, so you can also use a memory location: + +```nasm +; __m256 v = _mm256_broadcast_ss(&a[i]); +vbroadcastss ymm0, DWORD PTR [rdi] +``` + +When using built-in vector types, you can create a zero vector and add a scalar to it: + +```c++ +vec v = 42 + vec{}; +``` + +### Mapping to Arrays + +If you want to avoid all this complexity, you can just dump the vector in memory and read its values back as scalars: + +```c++ +void print(__m256i v) { + auto t = (unsigned*) &v; + for (int i = 0; i < 8; i++) + std::cout << std::bitset<32>(t[i]) << " "; + std::cout << std::endl; +} +``` + +This may not be fast or technically legal (the C++ standard doesn't specify what happens when you cast data like this), but it is simple, and I frequently use this code to print out the contents of a vector during debugging. + + + +### Non-Contiguous Load + +Later SIMD extensions added special "gather" and "scatter instructions that read/write data non-sequentially using arbitrary array indices. These don't work 8 times faster though and are usually limited by the memory rather than the CPU, but they are still helpful for certain applications such as sparse linear algebra. + +Gather is available since AVX2, and various scatter instructions are available since AVX512. + +![](../img/gather-scatter.png) + +Let's see if they work faster than scalar reads. First, we create an array of size $N$ and $Q$ random read queries: + +```c++ +int a[N], q[Q]; + +for (int i = 0; i < N; i++) + a[i] = rand(); + +for (int i = 0; i < Q; i++) + q[i] = rand() % N; +``` + +In the scalar code, we add the elements specified by the queries to a checksum one by one: + +```c++ +int s = 0; + +for (int i = 0; i < Q; i++) + s += a[q[i]]; +``` + +And in the SIMD code, we use the `gather` instruction to do that for 8 different indexes in parallel: + +```c++ +reg s = _mm256_setzero_si256(); + +for (int i = 0; i < Q; i += 8) { + reg idx = _mm256_load_si256( (reg*) &q[i] ); + reg x = _mm256_i32gather_epi32(a, idx, 4); + s = _mm256_add_epi32(s, x); +} +``` + +They perform roughly the same, except when the array fits into the L1 cache: + +![](../img/gather.svg) + +The purpose of `gather` and `scatter` is not to perform memory operations faster, but to get the data into registers to perform heavy computations on them. For anything costlier than just one addition, they are hugely favorable. + +The lack of (fast) gather and scatter instructions makes SIMD programming on CPUs very different from proper parallel computing environments that support independent memory access. You have to always engineer around it and employ various ways of organizing your data sequentially so that it be loaded into registers. 
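+
+As a reference for what the gather-based loop above computes, here is its scalar equivalent (a sketch reusing `a`, `q`, and `Q` from the snippets above; note that the last argument of `_mm256_i32gather_epi32` is a *byte* scale, which is why it is 4 for 32-bit integers):
+
+```c++
+int s_lanes[8] = {0}; // the eight lanes of the vector accumulator
+
+for (int i = 0; i < Q; i += 8)
+    for (int j = 0; j < 8; j++)
+        s_lanes[j] += a[q[i + j]]; // lane j reads *(int*) ((char*) a + q[i + j] * 4)
+```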
diff --git a/content/english/hpc/simd/permutation.md b/content/english/hpc/simd/permutation.md deleted file mode 100644 index 711aba60..00000000 --- a/content/english/hpc/simd/permutation.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -title: In-Register Shuffles -weight: 6 ---- - -Masking is the most widely used technique for data manipulation, but there are many other handy SIMD features that we will later use in this chapter: - -- You can [broadcast](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=6331,5160,588&techs=AVX,AVX2&text=broadcast) a single value to a vector from a register or a memory location. -- You can [permute](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=permute&techs=AVX,AVX2&expand=6331,5160) data inside a register almost arbitrarily. -- We can create tiny lookup tables with [pshufb](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=pshuf&techs=AVX,AVX2&expand=6331) instruction. This is useful when you have some logic that isn't implemented in SSE, and this operation is so instrumental in some algorithms that [Wojciech Muła](http://0x80.pl/) — the guy who came up with a half of the algorithms described in this chapter — took it as his [Twitter handle](https://twitter.com/pshufb) -- Since AVX2, you can use "gather" instructions that load data non-sequentially using arbitrary array indices. These don't work 8 times faster though and are usually limited by memory rather than CPU, but they are still helpful for stuff like sparse linear algebra. -- AVX512 has similar "scatter" instructions that write data non-sequentially, using either indices or [a mask](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=compress&expand=4754,4479&techs=AVX_512). You can very efficiently "filter" an array this way using a predicate. - -The last two, gather and scatter, turn SIMD into proper parallel programming model, where most operations can be executed independently in terms of their memory locations. This is a huge deal: many AVX512-specific algorithms have been developed recently owning to these new instructions, and not just having twice as many SIMD lanes. diff --git a/content/english/hpc/simd/reduction.md b/content/english/hpc/simd/reduction.md index 3dcb109c..89678103 100644 --- a/content/english/hpc/simd/reduction.md +++ b/content/english/hpc/simd/reduction.md @@ -1,9 +1,9 @@ --- -title: Sums and Other Reductions +title: Reductions weight: 3 --- -*Reduction* (also known as *folding* in functional programming) is the action of computing the value of some associative and commutative operation (i.e. $(a \circ b) \circ c = a \circ (b \circ c)$ and $a \circ b = b \circ a$) over a range of arbitrary elements. +*Reduction* (also known as *folding* in functional programming) is the action of computing the value of some associative and commutative operation (i.e., $(a \circ b) \circ c = a \circ (b \circ c)$ and $a \circ b = b \circ a$) over a range of arbitrary elements. The simplest example of reduction is calculating the sum an array: @@ -46,56 +46,64 @@ int sum_simd(v8si *a, int n) { } ``` -You can use this approach for for other reductions, such as for finding the minimum or the xor-sum of an array. - -### Horizontal Summation - -The last part, where we sum up the 8 accumulators stored in a vector register into a single scalar to get the total sum, is called "horizontal summation". 
-
-Although extracting and adding every scalar one by one only takes a constant number of cycles, it can be computed slightly faster using a [special instruction](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX,AVX2&text=_mm256_hadd_epi32&expand=2941) that adds together pairs of adjacent elements in a register.
-
-![Horizontal summation in SSE/AVX. Note how the output is stored: the (a b a b) interleaving is common for reducing operations](../img/hsum.png)
-
-Since it is a very specific operation, it can only be done with SIMD intrinsics — although the compiler probably emits roughly the same procedure for the scalar code anyway:
-
-```c++
-int hsum(__m256i x) {
-    __m128i l = _mm256_extracti128_si256(x, 0);
-    __m128i h = _mm256_extracti128_si256(x, 1);
-    l = _mm_add_epi32(l, h);
-    l = _mm_hadd_epi32(l, l);
-    return _mm_extract_epi32(l, 0) + _mm_extract_epi32(l, 1);
-}
-```
-
-There are [other similar instructions](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX,AVX2&ig_expand=3037,3009,5135,4870,4870,4872,4875,833,879,874,849,848,6715,4845&text=horizontal), e. g. for integer multiplication or calculating absolute differences between adjacent elements (used in image processing).
+You can use this approach for other reductions, such as for finding the minimum or the xor-sum of an array.
 
 ### Instruction-Level Parallelism
 
-Our implementation matches what the compiler produces automatically, but it is actually [suboptimal](/hpc/pipelining/throughput): when we use just one accumulator, we have to wait one cycle between the loop iterations for vector addition to complete, while its throughput is 2 on this microarchitecture.
+Our implementation matches what the compiler produces automatically, but it is actually suboptimal: when we use just one accumulator, [we have to wait](/hpc/pipelining/throughput) one cycle between the loop iterations for a vector addition to complete, while the [throughput](/hpc/pipelining/tables/) of the corresponding instruction is 2 on this microarchitecture.
 
 If we again divide the array in $B \geq 2$ parts and use a *separate* accumulator for each, we can saturate the throughput of vector addition and increase the performance twofold:
 
 ```c++
-const int B = 2;
+const int B = 2; // how many vector accumulators to use
 
 int sum_simd(v8si *a, int n) {
     v8si b[B] = {0};
 
-    for (int i = 0; i < n / 8; i += B)
+    for (int i = 0; i + (B - 1) < n / 8; i += B)
         for (int j = 0; j < B; j++)
             b[j] += a[i + j];
-    
+
+    // sum all vector accumulators into one
     for (int i = 1; i < B; i++)
         b[0] += b[i];
 
     int s = 0;
 
+    // sum 8 scalar accumulators into one
     for (int i = 0; i < 8; i++)
        s += b[0][i];
 
+    // add the remainder of a, treating it as an array of scalars
+    for (int i = n / (8 * B) * (8 * B); i < n; i++)
+        s += ((int*) a)[i];
+
     return s;
 }
 ```
 
-If you have more than 2 relevant execution ports, you can increase `B` accordingly. But the n-fold performance increase will only apply to arrays that fit L1 cache — [memory bandwidth](/hpc/cpu-cache/bandwidth) will be the bottleneck for anything larger.
+If you have more than 2 relevant execution ports, you can increase the `B` constant accordingly, but the $B$-fold performance increase will only apply to arrays that fit into L1 cache — [memory bandwidth](/hpc/cpu-cache/bandwidth) will be the bottleneck for anything larger.
+
+### Horizontal Summation
+
+The part where we sum up the 8 accumulators stored in a vector register into a single scalar to get the total sum is called "horizontal summation."
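+
+The simplest way to do it is to extract the scalars one by one (or just spill the register into memory) and add them up; a sketch that is functionally equivalent to the faster versions below:
+
+```c++
+int hsum_naive(__m256i x) {
+    int t[8];
+    _mm256_storeu_si256((__m256i*) t, x); // dump the 8 lanes into a temporary array
+    int s = 0;
+    for (int i = 0; i < 8; i++)
+        s += t[i];
+    return s;
+}
+```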
+ +Although extracting and adding every scalar one by one only takes a constant number of cycles, it can be computed slightly faster using a [special instruction](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#techs=AVX,AVX2&text=_mm256_hadd_epi32&expand=2941) that adds together pairs of adjacent elements in a register. + +![Horizontal summation in SSE/AVX. Note how the output is stored: the (a b a b) interleaving is common for reducing operations](../img/hsum.png) + +Since it is a very specific operation, it can only be done with SIMD intrinsics — although the compiler probably emits roughly the same procedure for the scalar code anyway: + +```c++ +int hsum(__m256i x) { + __m128i l = _mm256_extracti128_si256(x, 0); + __m128i h = _mm256_extracti128_si256(x, 1); + l = _mm_add_epi32(l, h); + l = _mm_hadd_epi32(l, l); + return _mm_extract_epi32(l, 0) + _mm_extract_epi32(l, 1); +} +``` + +There are [other similar instructions](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=AVX,AVX2&ig_expand=3037,3009,5135,4870,4870,4872,4875,833,879,874,849,848,6715,4845&text=horizontal), e.g., for integer multiplication or calculating absolute differences between adjacent elements (used in image processing). + +There is also one specific instruction, `_mm_minpos_epu16`, that calculates the horizontal minimum and its index among eight 16-bit integers. This is the only horizontal reduction that works in one go: all others are computed in multiple steps. diff --git a/content/english/hpc/simd/shuffling.md b/content/english/hpc/simd/shuffling.md new file mode 100644 index 00000000..6ff3b749 --- /dev/null +++ b/content/english/hpc/simd/shuffling.md @@ -0,0 +1,246 @@ +--- +title: In-Register Shuffles +weight: 6 +--- + +[Masking](../masking) lets you apply operations to only a subset of vector elements. It is a very effective and frequently used data manipulation technique, but in many cases, you need to perform more advanced operations that involve permuting values inside a vector register instead of just blending them with other vectors. + +The problem is that adding a separate element-shuffling instruction for each possible use case in hardware is unfeasible. What we can do though is to add just one general permutation instruction that takes the indices of a permutation and produces these indices using precomputed lookup tables. + +This general idea is perhaps too abstract, so let's jump straight to the examples. + +### Shuffles and Popcount + +*Population count*, also known as the *Hamming weight*, is the count of `1` bits in a binary string. + +It is a frequently used operation, so there is a separate instruction on x86 that computes the population count of a word: + +```c++ +const int N = (1<<12); +int a[N]; + +int popcnt() { + int res = 0; + for (int i = 0; i < N; i++) + res += __builtin_popcount(a[i]); + return res; +} +``` + +It also supports 64-bit integers, improving the total throughput twofold: + +```c++ +int popcnt_ll() { + long long *b = (long long*) a; + int res = 0; + for (int i = 0; i < N / 2; i++) + res += __builtin_popcountl(b[i]); + return res; +} +``` + +The only two instructions required are load-fused popcount and addition. They both have a high throughput, so the code processes about $8+8=16$ bytes per cycle as it is limited by the decode width of 4 on this CPU. + +These instructions were added to x86 CPUs around 2008 with SSE4. 
Let's temporarily go back in time before vectorization even became a thing and try to implement popcount by other means. + +The naive way is to go through the binary string bit by bit: + +```c++ +__attribute__ (( optimize("no-tree-vectorize") )) +int popcnt() { + int res = 0; + for (int i = 0; i < N; i++) + for (int l = 0; l < 32; l++) + res += (a[i] >> l & 1); + return res; +} +``` + +As anticipated, it works just slightly faster than ⅛-th of a byte per cycle — at around 0.2. + +We can try to process in bytes instead of individual bits by [precomputing](/hpc/compilation/precalc) a small 256-element *lookup table* that contains the population counts of individual bytes and then query it while iterating over raw bytes of the array: + +```c++ +struct Precalc { + alignas(64) char counts[256]; + + constexpr Precalc() : counts{} { + for (int m = 0; m < 256; m++) + for (int i = 0; i < 8; i++) + counts[m] += (m >> i & 1); + } +}; + +constexpr Precalc P; + +int popcnt() { + auto b = (unsigned char*) a; // careful: plain "char" is signed + int res = 0; + for (int i = 0; i < 4 * N; i++) + res += P.counts[b[i]]; + return res; +} +``` + +It now processes around 2 bytes per cycle, rising to ~2.7 if we switch to 16-bit words (`unsigned short`). + +This solution is still very slow compared to the `popcnt` instruction, but now it can be vectorized. Instead of trying to speed it up through [gather](../moving#non-contiguous-load) instructions, we will go for another approach: make the lookup table small enough to fit inside a register and then use a special [pshufb](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=pshuf&techs=AVX,AVX2&expand=6331) instruction to look up its values in parallel. + +The original `pshufb` introduced in 128-bit SSE3 takes two registers: the lookup table containing 16 byte values and a vector of 16 4-bit indices (0 to 15), specifying which bytes to pick for each position. In 256-bit AVX2, instead of a 32-byte lookup table with awkward 5-bit indices, we have an instruction that independently the same shuffling operation over two 128-bit lanes. + +So, for our use case, we create a 16-byte lookup table with population counts for each nibble (half-byte), repeated twice: + +```c++ +const reg lookup = _mm256_setr_epi8( + /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, + + /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 +); +``` + +Now, to compute the population count of a vector, we split each of its bytes into the lower and higher nibbles and then use this lookup table to retrieve their counts. 
The only thing left is to carefully sum them up: + +```c++ +const reg low_mask = _mm256_set1_epi8(0x0f); + +int popcnt() { + int k = 0; + + reg t = _mm256_setzero_si256(); + + for (; k + 15 < N; k += 15) { + reg s = _mm256_setzero_si256(); + + for (int i = 0; i < 15; i += 8) { + reg x = _mm256_load_si256( (reg*) &a[k + i] ); + + reg l = _mm256_and_si256(x, low_mask); + reg h = _mm256_and_si256(_mm256_srli_epi16(x, 4), low_mask); + + reg pl = _mm256_shuffle_epi8(lookup, l); + reg ph = _mm256_shuffle_epi8(lookup, h); + + s = _mm256_add_epi8(s, pl); + s = _mm256_add_epi8(s, ph); + } + + t = _mm256_add_epi64(t, _mm256_sad_epu8(s, _mm256_setzero_si256())); + } + + int res = hsum(t); + + while (k < N) + res += __builtin_popcount(a[k++]); + + return res; +} +``` + +This code processes around 30 bytes per cycle. Theoretically, the inner loop could do 32, but we have to stop it every 15 iterations because the 8-bit counters can overflow. + +The `pshufb` instruction is so instrumental in some SIMD algorithms that [Wojciech Muła](http://0x80.pl/) — the guy who came up with this algorithm — took it as his [Twitter handle](https://twitter.com/pshufb). You can calculate population counts even faster: check out his [GitHub repository](https://github.com/WojciechMula/sse-popcount) with different vectorized popcount implementations and his [recent paper](https://arxiv.org/pdf/1611.07612.pdf) for a detailed explanation of the state-of-the-art. + +### Permutations and Lookup Tables + +Our last major example in this chapter is the `filter`. It is a very important data processing primitive that takes an array as input and writes out only the elements that satisfy a given predicate (in their original order). + +In a single-threaded scalar case, it is trivially implemented by maintaining a counter that is incremented on each write: + +```c++ +int a[N], b[N]; + +int filter() { + int k = 0; + + for (int i = 0; i < N; i++) + if (a[i] < P) + b[k++] = a[i]; + + return k; +} +``` + +To vectorize it, we will use the `_mm256_permutevar8x32_epi32` intrinsic. It takes a vector of values and individually selects them with a vector of indices. Despite the name, it doesn't *permute* values but just *copies* them to form a new vector: duplicates in the result are allowed. + +The general idea of our algorithm is as follows: + +- calculate the predicate on a vector of data — in this case, this means performing the comparisons to get the mask; +- use the `movemask` instruction to get a scalar 8-bit mask; +- use this mask to index a lookup table that returns a permutation moving the elements that satisfy the predicate to the beginning of the vector (in their original order); +- use the `_mm256_permutevar8x32_epi32` intrinsic to permute the values; +- write the whole permuted vector to the buffer — it may have some trailing garbage, but its prefix is correct; +- calculate the population count of the scalar mask and move the buffer pointer by that number. 
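+
+For example, if the comparison produced the scalar mask `0b00100110` (a made-up value, meaning that elements 1, 2, and 5 satisfy the predicate), the lookup table entry for it will start with the indices 1, 2, 5, so the permutation moves exactly these three elements to the front, and `__builtin_popcount(0b00100110) = 3` advances the write pointer past them.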
+ +First, we need to precompute the permutations: + +```c++ +struct Precalc { + alignas(64) int permutation[256][8]; + + constexpr Precalc() : permutation{} { + for (int m = 0; m < 256; m++) { + int k = 0; + for (int i = 0; i < 8; i++) + if (m >> i & 1) + permutation[m][k++] = i; + } + } +}; + +constexpr Precalc T; +``` + +Then we can implement the algorithm itself: + +```c++ +const reg p = _mm256_set1_epi32(P); + +int filter() { + int k = 0; + + for (int i = 0; i < N; i += 8) { + reg x = _mm256_load_si256( (reg*) &a[i] ); + + reg m = _mm256_cmpgt_epi32(p, x); + int mask = _mm256_movemask_ps((__m256) m); + reg permutation = _mm256_load_si256( (reg*) &T.permutation[mask] ); + + x = _mm256_permutevar8x32_epi32(x, permutation); + _mm256_storeu_si256((reg*) &b[k], x); + + k += __builtin_popcount(mask); + } + + return k; +} +``` + +The vectorized version takes some work to implement, but it is 6-7x faster than the scalar one (the speedup is slightly less for either low or high values of `P` as the [branch becomes predictable](/hpc/pipelining/branching)). + +![](../img/filter.svg) + +The loop performance is still relatively low — taking 4 CPU cycles per iteration — because, on this particular CPU (Zen 2), `movemask`, `permute`, and `store` have low throughput and all have to go through the same execution port (P2). On most other x86 CPUs, you can expect it to be ~2x faster. + +Filtering can also be implemented considerably faster on AVX-512: it has a special "[compress](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7395,7392,7269,4868,7269,7269,1820,1835,6385,5051,4909,4918,5051,7269,6423,7410,150,2138,1829,1944,3009,1029,7077,519,5183,4462,4490,1944,1395&text=_mm512_mask_compress_epi32)" instruction that takes a vector of data and a mask and writes its unmasked elements contiguously. It makes a huge difference in algorithms that rely on various filtering subroutines, such as quicksort. + + diff --git a/content/english/hpc/slides/01-intro/_index.md b/content/english/hpc/slides/01-intro/_index.md new file mode 100644 index 00000000..615a89aa --- /dev/null +++ b/content/english/hpc/slides/01-intro/_index.md @@ -0,0 +1,297 @@ +--- +title: Why Go Beyond Big O? 
+outputs: [Reveal] +--- + +# Performance Engineering + +Sergey Slotin + +$x + y$ + +May 7, 2022 + +--- + +### About me + +- Former [competitive programmer](https://codeforces.com/profile/sslotin) +- Created [Algorithmica.org](https://ru.algorithmica.org/cs) and "co-founded" [Tinkoff Generation](https://algocode.ru/) +- Wrote [Algorithms for Modern Hardware](https://en.algorithmica.org/hpc/), on which these lectures are based +- Twitter: [@sergey_slotin](https://twitter.com/sergey_slotin); Telegram: [@bydlokoder](https://t.me/bydlokoder); anywhere else: @sslotin + +---- + +### About this mini-course + +- Low-level algorithm optimization +- Two days, six lectures +- **Day 1:** CPU architecture & assembly, pipelining, SIMD programming +- **Day 2:** CPU caches & memory, binary search, tree data structures +- Prerequisites: CS 102, C/C++ +- No assignments, but you are encouraged to reproduce case studies: https://github.com/sslotin/amh-code + +--- + +## Lecture 0: Why Go Beyond Big O + +*(AMH chapter 1)* + +--- + +## The RAM Model of Computation + +- There is a set of *elementary operations* (read, write, add, multiply, divide) +- Each operation is executed sequentially and has some constant *cost* +- Running time ≈ sum of all elementary operations weghted by their costs + +---- + +![](https://en.algorithmica.org/hpc/complexity/img/cpu.png =400x) + +- The “elementary operations” of a CPU are called *instructions* +- Their “costs” are called *latencies* (measured in cycles) +- Instructions modify the state of the CPU stored in a number of *registers* +- To convert to real time, sum up all latencies of executed instructions and divide by the *clock frequency* (the number of cycles a particular CPU does per second) +- Clock speed is volatile, so counting cycles is more useful for analytical purposes + +---- + +![](https://external-preview.redd.it/6PIp0RLbdWFGFUOT6tFuufpMlplgWdnXWOmjuqkpMMU.jpg?auto=webp&s=9bed495f3dbb994d7cdda33cc114aba1cebd30e2 =400x) + +http://ithare.com/infographics-operation-costs-in-cpu-clock-cycles/ + +---- + +### Asymptotic complexity + +![](https://en.algorithmica.org/hpc/complexity/img/complexity.jpg =400x) + +For sufficiently large $n$, we only care about asymptotic complexity: $O(n) = O(1000 \cdot n)$ + +$\implies$ The costs of basic ops don't matter since they don't affect complexity + +But can we handle "sufficiently large" $n$? + +--- + +When complexity theory was developed, computers were different + +![](https://upload.wikimedia.org/wikipedia/commons/thumb/4/4e/Eniac.jpg/640px-Eniac.jpg =500x) + +Bulky, costly, and fundamentally slow (due to speed of light) + +---- + +![](https://researchresearch-news-wordpress-media-live.s3.eu-west-1.amazonaws.com/2022/02/microchip_fingertip-738x443.jpg =500x) + +Micro-scale circuits allow signals to propagate faster + +---- + + + +
+ +
+ +![](https://en.algorithmica.org/hpc/complexity/img/lithography.png =450x) + +
+ +
+ +Microchips are "printed" on a slice of silicon using a procees called [photolithography](https://en.wikipedia.org/wiki/Photolithography): + +1. grow and slice a [very pure silicon crystal](https://en.wikipedia.org/wiki/Wafer_(electronics)) +2. cover it with a layer of [photoresist](https://en.wikipedia.org/wiki/Photoresist) +3. hit it with photons in a set pattern +4. chemically [etch](https://en.wikipedia.org/wiki/Etching_(microfabrication)) the exposed parts +5. remove the remaining photoresist + +(…plus another 40-50 steps over several months to complete the rest of the CPU) + +
+ +
+ +---- + +The development of microchips and photolithography enabled: + +- higher clock rates +- the ability to scale the production +- **much** lower material and power usage (= lower cost) + +---- + +![](https://upload.wikimedia.org/wikipedia/commons/4/49/MOS_6502AD_4585_top.jpg =500x) + +MOS Technology 6502 (1975), Atari 2600 (1977), Apple II (1977), Commodore 64 (1982) + +---- + +Also a clear path to improvement: just make lenses stronger and chips smaller + +**Moore’s law:** transistor count doubles every two years. + +---- + +**Dennard scaling:** reducing die dimensions by 30% + +- doubles the transistor density ($0.7^2 \approx 0.5$) +- increases the clock speed by 40% ($\frac{1}{0.7} \approx 1.4$) +- leaves the overall *power density* the same + (we have a mechanical limit on how much heat can be dissipated) + +$\implies$ Each new "generation" should have roughly the same total cost, but 40% higher clock and twice as many transistors + +(which can be used, e.g., to add new instructions or increase the word size) + +---- + +Around 2005, Dennard scaling stopped — due to *leakage* issues: + +- transistors became very smal +- $\implies$ their magnetic fields started to interfere with the neighboring circuitry +- $\implies$ unnecessary heating and occasional bit flipping +- $\implies$ have to increase voltage to fix it +- $\implies$ have to reduce clock frequency to balance off power consumption + +---- + +![](https://en.algorithmica.org/hpc/complexity/img/dennard.ppm =600x) + +A limit on the clock speed + +--- + +Clock rates have plateaued, but we still have more transistors to use: + +- **Pipelining:** overlapping the execution of sequential instructions to keep different parts of the CPU busy +- **Out-of-order execution:** no waiting for the previous instructions to complete +- **Superscalar processing:** adding duplicates of execution units +- **Caching:** adding layers of faster memory on the chip to speed up RAM access +- **SIMD:** adding instructions that handle a block of 128, 256, or 512 bits of data +- **Parallel computing:** adding multiple identinal cores on a chip +- **Distributed computing:** multiple chips in a motherboard or multiple computers +- **FPGAs** and **ASICs:** using custom hardware to solve a specific problem + +---- + +![](https://en.algorithmica.org/hpc/complexity/img/die-shot.jpg =500x) + +For modern computers, the “let’s count all operations” approach for predicting algorithm performance is off by several orders of magnitude + +--- + +### Matrix multiplication + +```python +n = 1024 + +a = [[random.random() + for row in range(n)] + for col in range(n)] + +b = [[random.random() + for row in range(n)] + for col in range(n)] + +c = [[0 + for row in range(n)] + for col in range(n)] + +for i in range(n): + for j in range(n): + for k in range(n): + c[i][j] += a[i][k] * b[k][j] +``` + +630 seconds or 10.5 minutes to multiply two $1024 \times 1024$ matrices in plain Python + +~880 cycles per multiplication + +---- + +```java +public class Matmul { + static int n = 1024; + static double[][] a = new double[n][n]; + static double[][] b = new double[n][n]; + static double[][] c = new double[n][n]; + + public static void main(String[] args) { + Random rand = new Random(); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + a[i][j] = rand.nextDouble(); + b[i][j] = rand.nextDouble(); + c[i][j] = 0; + } + } + + for (int i = 0; i < n; i++) + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + c[i][j] += a[i][k] * b[k][j]; + } +} +``` + +Java needs 
10 seconds, 63 times faster + +~13 cycles per multiplication + +---- + +```c +#define n 1024 +double a[n][n], b[n][n], c[n][n]; + +int main() { + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + a[i][j] = (double) rand() / RAND_MAX; + b[i][j] = (double) rand() / RAND_MAX; + } + } + + for (int i = 0; i < n; i++) + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + c[i][j] += a[i][k] * b[k][j]; + + return 0; +} +``` + +`GCC -O3` needs 9 seconds, but if we include `-march=native` and `-ffast-math`, the compiler vectorizes the code, and it drops down to 0.6s. + +---- + +```python +import time +import numpy as np + +n = 1024 + +a = np.random.rand(n, n) +b = np.random.rand(n, n) + +start = time.time() + +c = np.dot(a, b) + +duration = time.time() - start +print(duration) +``` + +BLAS needs ~0.12 seconds +(~5x over auto-vectorized C and ~5250x over plain Python) diff --git a/content/english/hpc/slides/_index.md b/content/english/hpc/slides/_index.md new file mode 100644 index 00000000..794e67a6 --- /dev/null +++ b/content/english/hpc/slides/_index.md @@ -0,0 +1,10 @@ +--- +title: Slides +ignoreIndexing: true +weight: 1000 +draft: true +--- + +This is an attempt to make a university course out of the book. + +Work in progress. diff --git a/content/english/hpc/stats.md b/content/english/hpc/stats.md index 2961f4d5..15d81e39 100644 --- a/content/english/hpc/stats.md +++ b/content/english/hpc/stats.md @@ -18,7 +18,7 @@ A **random variable** is any variable whose value depends on an outcome of a ran 2. $\forall x \in X, 0 \leq P \leq 1$. 3. $\sum_{x \in X} P(x) = 1$. -For example, consider a random variable $X$ with $k$ discrete states (e. g. the result of a die toss). We can place a *uniform distribution* on $X$ — that is, make each of its states equally likely — by setting its probability distribution to: +For example, consider a random variable $X$ with $k$ discrete states (e.g., the result of a die toss). We can place a *uniform distribution* on $X$ — that is, make each of its states equally likely — by setting its probability distribution to: $$ P(x=x_i) = \frac{1}{k} @@ -121,7 +121,7 @@ The last transition is true because it is a sum of harmonic series. ### Order Statistics -There is a slight modification of quicksort called quickselect that allows finding the $k$-th smallest element in $O(n)$ time, which is useful when we need to quickly compute order statistics, e. g. medians or 75-th quantiles. +There is a slight modification of quicksort called quickselect that allows finding the $k$-th smallest element in $O(n)$ time, which is useful when we need to quickly compute order statistics; e.g., medians or 75-th quantiles. 1. Select a random element $p$ from the array. 2. Partition the array into two arrays $L$ and $R$ using the predicate $a_i > p$. @@ -193,7 +193,7 @@ f(n, m) &= 1 \times (1-\frac{1}{m}) \times (1-\frac{2}{m}) \times ... \times (1- \end{aligned} $$ -This product shrinks pretty quickly with $n$, but it is not clear what value of $m$ is needed to be "safe". Turns out, if $n = O(\sqrt m)$, the probability of collision tends to zero, and anything asymptotically larger guarantees a collision. One can show this with calculus, but we will choose the probability theory way. +This product shrinks pretty quickly with $n$, but it is not clear what value of $m$ is needed to be "safe." Turns out, if $n = O(\sqrt m)$, the probability of collision tends to zero, and anything asymptotically larger guarantees a collision. 
One can show this with calculus, but we will choose the probability theory way. Let's go back to the idea of counting pairs of birthdays and introduce $\frac{n \cdot (n-1)}{2}$ indicators $I_{ij}$ — one for each pair $(i, j)$ of persons — each being equal to $1$ if the birthdays match. The probability and expectation of each indicator is $\frac{1}{m}$. diff --git a/content/russian/contributing.md b/content/russian/contributing.md index e7fb62ee..c33a6b1f 100644 --- a/content/russian/contributing.md +++ b/content/russian/contributing.md @@ -1,9 +1,10 @@ --- title: Как добавлять и редактировать статьи authors: -- Сергей Слотин -date: 2021-09-30 + - Сергей Слотин +date: 2021-01-23 hideSidebar: true +published: true --- Неполные гайдлайны, которые постепенно будут пополняться. @@ -14,9 +15,7 @@ hideSidebar: true ### Если у меня маленькая правка -Нужно нажать на кнопку с карандашом сверху справа. Откроется интерфейс prose.io, в котором нужно залогиниться через github, после чего можно редактировать markdown-исходник страницы. - -При первом сохранении автоматически создастся ветка и pull request от вашего имени, и при дальнейших он будет обновляться. Когда закончили, оставьте как есть — кто-нибудь придет и апрувнет. +На любой странице сайта можно нажать кнопку с карандашом сверху справа. Откроется интерфейс prose.io, в котором нужно залогиниться через GitHub, после чего можно редактировать markdown-исходник страницы. При первом сохранении автоматически создастся ветка и pull request от вашего имени, и при дальнейших он будет обновляться. Когда закончили, оставьте как есть — кто-нибудь придет и апрувнет. Полного preview там нет — осторожнее с правкой сложных формул, если не уверены в корректности. @@ -24,9 +23,7 @@ hideSidebar: true ### Если у меня большая правка -Для чего-либо серьёзного рекомендуется счекаутить репозиторий и поднять сайт локально. - -Это можно сделать так (предполагается, что вы знакомы с работой в терминале): +Для чего-либо серьёзного рекомендуется счекаутить репозиторий и поднять сайт локально. Это можно сделать так (предполагается, что вы знакомы с работой в терминале): 1. [Поставить Hugo](https://gohugo.io/getting-started/installing/): скорее всего одно из `sudo apt-get install hugo`, `sudo pacman -Syu hugo`, `brew install hugo` или `choco install hugo -confirm` в зависимости от системы. 2. Форкнуть репозиторий и сделать `git clone https://github.com/$USERNAME/algorithmica.git`. @@ -56,7 +53,7 @@ hideSidebar: true [Гайд по синтаксису](https://www.markdownguide.org/basic-syntax/). -Помимо основного синтаксиса, поддерживаются ещё таблицы, блоки кода, strikethrough, latex (через один или два `$`) и tikz (через две `@`). +Помимо основного синтаксиса, поддерживаются ещё таблицы, блоки кода, strikethrough, latex-формулы (через один или два `$`) и tikz-диаграммы (через две `@`). ### Front matter @@ -73,7 +70,7 @@ hideSidebar: true ## Правила русского языка -Ревьюер всё равно поправит, но пожалуйста, имейте в виду: +Ревьюер всё равно поправит, но, пожалуйста, имейте в виду: 1. Кавычки: « и ». 2. [Дефисы, минусы и тире](https://www.artlebedev.ru/kovodstvo/sections/97/): -, $a-b$ (через latex) и —. 
diff --git a/content/russian/cs/algebra/binpow.md b/content/russian/cs/algebra/binpow.md index 5c7d2d43..4126061d 100644 --- a/content/russian/cs/algebra/binpow.md +++ b/content/russian/cs/algebra/binpow.md @@ -6,7 +6,7 @@ authors: weight: -10 --- -*Бинарное возведение в степень* — приём, позволяющий возводить любое число в $n$-ую степень за $O(\log n)$ умножений (вместо n умножений при обычном подходе). +*Бинарное возведение в степень* — приём, позволяющий возводить любое число в $n$-ую степень за $O(\log n)$ умножений (вместо $n$ умножений при обычном подходе). ## Основная идея diff --git a/content/russian/cs/algebra/matmul.md b/content/russian/cs/algebra/matmul.md index bc5ca593..8a633bea 100644 --- a/content/russian/cs/algebra/matmul.md +++ b/content/russian/cs/algebra/matmul.md @@ -188,7 +188,7 @@ matrix binpow(matrix a, int p) { Эту технику можно применить и к другим динамикам, где нужно посчитать количество способов что-то сделать — иногда очень неочевидными способами. -Например, можно решить такую задачу: найти количество строк длины $k \approx 10^{18}$, не содержащих данные маленькие запрещённые подстроки. Для этого нужно построить граф «легальных» переходов в [Ахо-Корасике](/cs/automata/aho-corasick), возвести его матрицу смежности в $k$-тую степень и просуммировать в нём первую строчку. +Например, можно решить такую задачу: найти количество строк длины $k \approx 10^{18}$, не содержащих данные маленькие запрещённые подстроки. Для этого нужно построить граф «легальных» переходов в [Ахо-Корасике](/cs/string-structures/aho-corasick), возвести его матрицу смежности в $k$-тую степень и просуммировать в нём первую строчку. В некоторых изощрённых случаях в матричном умножении вместо умножения и сложения нужно использовать другие операции, которые ведут себя как умножение и сложение. Пример задачи: «найти путь от $s$ до $t$ с минимальным весом ребра, использующий ровно $k$ переходов»; здесь нужно возводить в $(k-1)$-ую степень матрицу весов графа, и вместо и сложения, и умножения использовать минимум из двух весов. 
diff --git a/content/russian/cs/basic-structures/dequeue.md b/content/russian/cs/basic-structures/deque.md similarity index 88% rename from content/russian/cs/basic-structures/dequeue.md rename to content/russian/cs/basic-structures/deque.md index 699cc5f1..fb4f800a 100644 --- a/content/russian/cs/basic-structures/dequeue.md +++ b/content/russian/cs/basic-structures/deque.md @@ -4,11 +4,11 @@ weight: 7 draft: true --- -`dequeue` - структура, позволяющая работать и с началом и концом +`deque` - структура, позволяющая работать и с началом и концом одновременно, то есть вставка и удаление с двух сторон ``` C++ -dequeue name; // дек типа T с названием name +deque name; // дек типа T с названием name name.front(), name.back(); // ссылка на первый и последний элемент соответственно name.pop_front(), name.pop_back(); // удаление первого и последнего элемента name.push_front(x), name.push_back(x); // вставка x в начало/конец diff --git a/content/russian/cs/basic-structures/iterators.md b/content/russian/cs/basic-structures/iterators.md index b2d8269f..c048e0b6 100644 --- a/content/russian/cs/basic-structures/iterators.md +++ b/content/russian/cs/basic-structures/iterators.md @@ -71,7 +71,7 @@ for (int x : c) ### Алгоритмы из STL -Например, итераторы `std::vector` относятся к `random_access_iterator`, и если вызвать функцию `lower_bound` из стандартной библиотеки, то она произведет [бинарный поиск](../../ordered-search/binary-search) по элементам (предполагая, что они отсортированы в порядке неубывания): +Например, итераторы `std::vector` относятся к `random_access_iterator`, и если вызвать функцию `lower_bound` из стандартной библиотеки, то она произведет [бинарный поиск](/cs/interactive/binary-search/) по элементам (предполагая, что они отсортированы в порядке неубывания): ```cpp vector a = {1, 2, 3, 5, 8, 13}; @@ -93,4 +93,4 @@ array a = {4, 2, 1, 3}; cout << *min_element(a.begin(), a.end()) << endl; ``` -Подробнее про разные полезные алгоритмы STL можно прочитать в [ликбезе по C++](../../programming/cpp). + diff --git a/content/russian/cs/basic-structures/stack-minima.md b/content/russian/cs/basic-structures/stack-minima.md index 174f8c3a..3cebaf81 100644 --- a/content/russian/cs/basic-structures/stack-minima.md +++ b/content/russian/cs/basic-structures/stack-minima.md @@ -36,7 +36,7 @@ minima = st.top().second; Рассмотрим реализацию вышеописанных операций: -dequeue q; +deque q; Нахождение минимума: current_minimum = q.front(); Добавление элемента: diff --git a/content/russian/cs/complexity/asymptotic.md b/content/russian/cs/complexity/asymptotic.md index c8c27c73..b46e27ee 100644 --- a/content/russian/cs/complexity/asymptotic.md +++ b/content/russian/cs/complexity/asymptotic.md @@ -1,6 +1,7 @@ --- title: Асимптотический анализ weight: 2 +published: true --- Часто бывает полезно оценить, сколько времени работает алгоритм. Конечно, можно его просто реализовать и запустить, но тут возникают проблемы: @@ -18,7 +19,7 @@ weight: 2 При этом важно не просто считать строчки, а ещё учитывать, как реализованы некоторые отдельные вещи в самом языке. Например, в питоне срезы массива (`array[3:10]`) копируют этот массив, то есть этот срез работает за 7 элементарных действий. А `swap`, например, можно реализовать за 3 присваивания. -**Упражнение.** Попробуйте посчитать точное число *сравнений* и *присваиваний* в [сортировках](../sorting) пузырьком, выбором, вставками и подсчетом в худшем случае. Это должна быть какая-то формула, зависящая от $n$ — длины массива. 
+**Упражнение.** Попробуйте посчитать точное число *сравнений* и *присваиваний* в [сортировках](../../sorting) пузырьком, выбором, вставками и подсчетом в худшем случае. Это должна быть какая-то формула, зависящая от $n$ — длины массива. Чтобы учесть вообще все элементарные операции, ещё надо посчитать, например, сколько раз прибавилась единичка внутри цикла `for`. А ещё, например, строчка `n = len(array)` — это тоже действие. Поэтому даже посчитав их, не сразу очевидно, какой из этих алгоритмов работает быстрее — сравнивать формулы сложно. Хочется придумать способ упростить эти формулы так, чтобы @@ -28,7 +29,7 @@ weight: 2 Для этого придумали О-нотацию — асимптотическое время работы вместо точного (часто его ещё называют просто *асимптотикой*). -**Определение.** Пусть $f(n)$ — это какая-то функция. Говорят, что функция $g(n) = O(f(n))$, если существует такие константы $c$ и $n_0$, что $g(n) < c \cdot g(n)$ для всех $n \geq n_0$. +**Определение.** Пусть $f(n)$ — это какая-то функция. Говорят, что функция $g(n) = O(f(n))$, если существует такие константы $c$ и $n_0$, что $g(n) < c \cdot f(n)$ для всех $n \geq n_0$. Например: diff --git a/content/russian/cs/convex-hulls/graham.md b/content/russian/cs/convex-hulls/graham.md index 9736c034..49138c7e 100644 --- a/content/russian/cs/convex-hulls/graham.md +++ b/content/russian/cs/convex-hulls/graham.md @@ -23,7 +23,7 @@ vector graham(vector points) { // сортируем точки по полярному углу sort(points.begin(), points.end(), [&](r a, r b){ - return (a - p) ^ (b - p) > 0; + return (a - p0) ^ (b - p0) > 0; }); vector hull; diff --git a/content/russian/cs/decomposition/scanline.md b/content/russian/cs/decomposition/scanline.md index 1b3cf993..3bc99afd 100644 --- a/content/russian/cs/decomposition/scanline.md +++ b/content/russian/cs/decomposition/scanline.md @@ -1,14 +1,15 @@ --- title: Сканирующая прямая authors: -- Сергей Слотин + - Сергей Слотин prerequisites: -- /cs/range-queries -- /cs/segment-tree + - /cs/range-queries + - /cs/segment-tree weight: 1 +published: true --- -Метод сканирующей прямой (англ. *scanline*) заключается в сортировке точек или каких-то абстрактных *событий* (англ. *event*) и последующему проходу по ним. +Метод сканирующей прямой (англ. *scanline*) заключается в сортировке точек на координатной прямой либо каких-то абстрактных «событий» по какому-то признаку и последующему проходу по ним. Он часто используется для решения задач на структуры данных, когда все запросы известны заранее, а также в геометрии для нахождения объединений фигур. @@ -20,9 +21,9 @@ weight: 1 Назовем *интересными* те точки, в которых происходит смена количества отрезков, которыми она покрыта. Так как смена ответа может происходить только в интересной точке, то максимум достигается также в какой-то из интересных точек. Отсюда сразу следует решение за $O(n^2)$: просто перебрать все интересные точки (это будут концы заданных отрезков) и проверить для каждой по отдельности ответ. -Это решение можно улучшить. Отсортируем интересные точки по возрастанию координаты и прой по ним слева направо, поддерживая количество отрезков `cnt`, которые покрывают данную точку. Если в данной точке начинается отрезок, то надо увеличить `cnt` на единицу, а если заканчивается, то уменьшить. После этого пробуем обновить ответ на задачу текущим значением `cnt`. +Это решение можно улучшить. Отсортируем интересные точки по возрастанию координаты и пройдем по ним слева направо, поддерживая количество отрезков `cnt`, которые покрывают данную точку. 
Если в данной точке начинается отрезок, то надо увеличить `cnt` на единицу, а если заканчивается, то уменьшить. После этого пробуем обновить ответ на задачу текущим значением `cnt`. -Как такое писать: нужно представить интересные точки в виде структур с полями «координата» и «тип» (начало / конец) и отсортировать со своим компаратором. Удобно начало отрезка обозначать +1, а конец -1, чтобы просто прибавлять к `cnt` это значение и на разбирать случае. +Как такое писать: нужно представить интересные точки в виде структур с полями «координата» и «тип» (начало / конец) и отсортировать со своим компаратором. Удобно начало отрезка обозначать +1, а конец -1, чтобы просто прибавлять к `cnt` это значение и не разбивать на случаи. Единственный нюанс — если координаты двух точек совпали, чтобы получить правильный ответ, сначала надо рассмотреть все начала отрезков, а только потом концы (чтобы при обновлении ответа в этой координате учлись и правые, и левые граничные отрезки). @@ -62,15 +63,15 @@ int scanline(vector> segments) { **Задача.** Дан набор из $n$ отрезков на прямой, заданных координатами начал и концов $[l_i, r_i]$. Требуется найти суммарную длину их объединения. -Как и в прошлой задаче, отсортируем интересные точки и при проходе будем поддерживать число отрезков, покрывающих данную точку. Если оно больше 0, то отрезок который мы прошли с прошлой рассмотренной точки принадлежит объединению, и его длину нужно прибавить к ответу: +Как и в прошлой задаче, отсортируем все интересные точки и при проходе будем поддерживать число отрезков, покрывающих текущую точку. Если оно больше 0, то отрезок, который мы прошли с прошлой рассмотренной точки, принадлежит объединению, и его длину нужно прибавить к ответу: ```cpp int cnt = 0, res = 0, prev = -inf; for (event e : events) { - cnt += e.type; if (prev != -inf && cnt > 0) - res += prev - e.x; + res += e.x - prev; // весь отрезок [prev, e.x] покрыт cnt отрезками + cnt += e.type; prev = e.x; } ``` @@ -83,7 +84,7 @@ for (event e : events) { Воспользуемся следующим приемом: сразу считаем все запросы и сохраним их, чтобы потом ответить на все сразу. Добавим точки запросов как события с новым типом 0, который будет означать, что в этой точке надо ответить на запрос, и отдельным полем для номера запроса. -Теперь аналогично отсортируем отсортируем точки интереса и пройдем по ним слева направо, поддерживая `cnt` и отвечая на запросы, когда их встретим. +Теперь аналогично отсортируем точки интереса и пройдем по ним слева направо, поддерживая `cnt` и отвечая на запросы, когда их встретим. 
```cpp struct event { diff --git a/content/russian/cs/factorization/eratosthenes.md b/content/russian/cs/factorization/eratosthenes.md index 02e72c0e..acf47749 100644 --- a/content/russian/cs/factorization/eratosthenes.md +++ b/content/russian/cs/factorization/eratosthenes.md @@ -12,10 +12,10 @@ published: true Основная идея соответствует названию алгоритма: запишем ряд чисел $1, 2,\ldots, n$, а затем будем вычеркивать -* сначала числа, делящиеся на $2$, кроме самого числа $2$, -* потом числа, делящиеся на $3$, кроме самого числа $3$, -* с числами, делящимися на $4$, ничего делать не будем — мы их уже вычёркивали, -* потом продолжим вычеркивать числа, делящиеся на $5$, кроме самого числа $5$, +- сначала числа, делящиеся на $2$, кроме самого числа $2$, +- потом числа, делящиеся на $3$, кроме самого числа $3$, +- с числами, делящимися на $4$, ничего делать не будем — мы их уже вычёркивали, +- потом продолжим вычеркивать числа, делящиеся на $5$, кроме самого числа $5$, …и так далее. @@ -23,10 +23,10 @@ published: true ```c++ vector sieve(int n) { - vector is_prime(n+1, true); + vector is_prime(n + 1, true); for (int i = 2; i <= n; i++) if (is_prime[i]) - for (int j = 2*i; j <= n; j += i) + for (int j = 2 * i; j <= n; j += i) is_prime[j] = false; return is_prime; } @@ -49,7 +49,6 @@ $$ У исходного алгоритма асимптотика должна быть ещё лучше. Чтобы найти её точнее, нам понадобятся два факта про простые числа: 1. Простых чисел от $1$ до $n$ примерно $\frac{n}{\ln n}$ . - 2. Простые числа распределены без больших «разрывов» и «скоплений», то есть $k$-тое простое число примерно равно $k \ln k$. Мы можем упрощённо считать, что число $k$ является простым с «вероятностью» $\frac{1}{\ln n}$. Тогда, время работы алгоритма можно более точнее оценить как @@ -65,11 +64,11 @@ $$ ## Линейное решето -Основная проблема решета Эратосфена состоит в том, что некоторые числа мы будем помечать как составные несколько раз — а именно столько раз, сколько у них различных простых делителей. Чтобы достичь линейного времени работы, нам нужно придумать способ, как рассматривать все составные числа ровно один раз. +Основная проблема решета Эратосфена состоит в том, что некоторые числа мы будем помечать как составные несколько раз — столько, сколько у них различных простых делителей. Чтобы достичь линейного времени работы, нам нужно придумать способ, как рассматривать все составные числа ровно один раз. Обозначим за $d(k)$ минимальный простой делитель числа $k$ и заметим следующий факт: у составного числа $k$ есть единственное представление $k = d(k) \cdot r$, и при этом у числа $r$ нет простых делителей меньше $d(k)$. -Идея оптимизации состоит в том, чтобы перебирать этот $r$, и для каждого перебирать только нужные множители — а именно все от $2$ до $d(r)$ включительно. +Идея оптимизации состоит в том, чтобы перебирать этот $r$, и для каждого перебирать только нужные множители — а именно, все от $2$ до $d(r)$ включительно. ### Алгоритм diff --git a/content/russian/cs/geometry-basic/polygons.md b/content/russian/cs/geometry-basic/polygons.md index 7537e591..e0a3c5e7 100644 --- a/content/russian/cs/geometry-basic/polygons.md +++ b/content/russian/cs/geometry-basic/polygons.md @@ -80,7 +80,7 @@ $$ В более общем случае есть два популярных подхода, оба за $O(n)$. -Первый заключается в подсчете углов. Пройдемся по всем вершинам в порядке обхода и будем последовательно рассматривать углы с вершиной в точке $P$ и лучами, проходящими через соседние вершины многоугольника. 
Если просуммировать эти ориентированные углы, то получится какая-то величина $\theta$. Если точка $P$ лежит внутри многоугольника, то $\theta = \pm 2 \theta$, иначе $\theta = 0$. +Первый заключается в подсчете углов. Пройдемся по всем вершинам в порядке обхода и будем последовательно рассматривать углы с вершиной в точке $P$ и лучами, проходящими через соседние вершины многоугольника. Если просуммировать эти ориентированные углы, то получится какая-то величина $\theta$. Если точка $P$ лежит внутри многоугольника, то $\theta = \pm 2 \pi$, иначе $\theta = 0$. Второй заключается в подсчете, сколько раз луч, выпущенный из $P$, пересекает ребра многоугольника. diff --git a/content/russian/cs/geometry-basic/products.md b/content/russian/cs/geometry-basic/products.md index a4e1a3d5..488dbca6 100644 --- a/content/russian/cs/geometry-basic/products.md +++ b/content/russian/cs/geometry-basic/products.md @@ -1,6 +1,7 @@ --- title: Скалярное и векторное произведение weight: 2 +published: true --- Помимо очевидных сложения, вычитания и умножения на константу, у векторов можно ввести и свои особенные операции, которые нам упростят жизнь. @@ -40,9 +41,9 @@ $$ a \times b = |a| \cdot |b| \cdot \sin \theta = x_a y_b - y_a x_b $$ -Так же, как и со скалярным произведением, доказательство координатной формулы оставляется упражнением читателю. Если кто-то захочет это сделать: это следует из линейности обоих произведений (что в свою очередь тоже нужно доказать) и разложения и разложения по базисным векторам $\overline{(0, 1)}$ и $\overline{(1, 0)}$. +Так же, как и со скалярным произведением, доказательство координатной формулы оставляется упражнением читателю. Если кто-то захочет это сделать: это следует из линейности обоих произведений (что в свою очередь тоже нужно доказать) и разложения по базисным векторам $\overline{(0, 1)}$ и $\overline{(1, 0)}$. -Геометрически, это ориентированный объем параллелограмма, натянутого на вектора $a$ и $b$: +Геометрически, это ориентированная площадь параллелограмма, натянутого на вектора $a$ и $b$: ![](../img/cross.jpg) @@ -65,7 +66,7 @@ int operator^(r a, r b) { return a.x*b.y - b.x*a.y; } Скалярное и векторное произведения тесно связаны с углами между векторами и могут использоваться для подсчета величин вроде ориентированных углов и площадей, которые обычно используются для разных проверок. -Когда они уже реализованы, использовать произведения гораздо проще, чем опираться на алгебру. Например, можно легко угол между двумя векторами, подставив в знакомый нам `atan2` векторное и скалярное произведение: +Когда они уже реализованы, использовать произведения гораздо проще, чем опираться на алгебру. Например, можно легко вычислить угол между двумя векторами, подставив в знакомый нам `atan2` векторное и скалярное произведение: ```c++ double angle(r a, r b) { diff --git a/content/russian/cs/geometry-basic/segments.md b/content/russian/cs/geometry-basic/segments.md index 80734b25..2b686bbc 100644 --- a/content/russian/cs/geometry-basic/segments.md +++ b/content/russian/cs/geometry-basic/segments.md @@ -1,6 +1,7 @@ --- title: Прямые и отрезки weight: 3 +published: true --- Отрезок можно задать двумя точками своих концов. В любом порядке — ведь он, в отличие от вектора, неориентирован. @@ -76,14 +77,21 @@ $$ Об этой формуле можно думать как о скалярном произведении вектора-точки на нормированный ($\frac{1}{\sqrt{A^2+B^2}}$) вектор нормали, геометрически равный проекции точки на него. 
-Если же прямая задана 2 точками, то можно сделать так: +Если же прямая задана 2 точками, то можно выразить высоту из формулы для площади треугольника: $$ -\rho(P, L(A, B)) = \frac{ \overrightarrow{PA} \cdot -\overrightarrow{PB}}{|\overrightarrow{(A, B)}|} +A = \frac{1}{2} bh +\implies +h = \frac{2A}{b} $$ -Обратите внимание, что в знаменателе стоит скалярное произведение. +И посчитать эту высоту так: + +$$ +\rho(P, L(A, B)) = \frac{|\overrightarrow{PA} \times \overrightarrow{PB}|}{|\overrightarrow{AB}|} +$$ + +Обратите внимание, что в числителе стоит [векторное произведение](../products) — мы воспользовались тем, что по модулю оно равно удвоенной площади треугольника $\angle PAB$, ### Точка пересечения прямых diff --git a/content/russian/cs/geometry-basic/vectors.md b/content/russian/cs/geometry-basic/vectors.md index 05051396..ee1a052a 100644 --- a/content/russian/cs/geometry-basic/vectors.md +++ b/content/russian/cs/geometry-basic/vectors.md @@ -1,6 +1,7 @@ --- -title: Точки и векторы +title: Точки и вектора weight: 1 +published: true --- Отрезок, для которого указано, какой из его концов считается началом, а какой концом, называется *вектором*. Вектор на плоскости можно задать двумя числами — его координатами по горизонтали и вертикали. diff --git a/content/russian/cs/graph-traversals/bridges.md b/content/russian/cs/graph-traversals/bridges.md index 662fafdb..54bd63e4 100644 --- a/content/russian/cs/graph-traversals/bridges.md +++ b/content/russian/cs/graph-traversals/bridges.md @@ -1,6 +1,7 @@ --- title: Мосты и точки сочленения weight: 6 +published: true --- **Определение.** *Мостом* называется ребро, при удалении которого связный неориентированный граф становится несвязным. @@ -79,6 +80,7 @@ void dfs(int v, int p = -1) { void dfs(int v, int p = -1) { used[v] = 1; d[v] = h[v] = (p == -1 ? 0 : h[p] + 1); + int children = 0; for (int u : g[v]) { if (u != p) { if (used[u]) @@ -90,10 +92,11 @@ void dfs(int v, int p = -1) { // v -- точка сочленения // (это условие может выполниться много раз для разных детей) } + children++; } } } - if (p == -1 && g[v].size() > 1) { + if (p == -1 && children > 1) { // v -- корень и точка сочленения } } diff --git a/content/russian/cs/graph-traversals/connectivity.md b/content/russian/cs/graph-traversals/connectivity.md index 45ceec28..17628308 100644 --- a/content/russian/cs/graph-traversals/connectivity.md +++ b/content/russian/cs/graph-traversals/connectivity.md @@ -31,7 +31,7 @@ void dfs(int v, int num) { int num = 0; for (int v = 0; v < n; v++) if (!component[v]) - dfs(v, num++); + dfs(v, ++num); ``` После этого переменная `num` будет хранить число компонент связности, а массив `component` — номер компоненты для каждой вершины, который, например, можно использовать, чтобы быстро проверять, существует ли путь между заданной парой вершин. diff --git a/content/russian/cs/graph-traversals/cycle.md b/content/russian/cs/graph-traversals/cycle.md index 5347e9cd..7a274da1 100644 --- a/content/russian/cs/graph-traversals/cycle.md +++ b/content/russian/cs/graph-traversals/cycle.md @@ -60,6 +60,7 @@ int dfs(int v, int p = -1) { } } } + return -1; } ``` diff --git a/content/russian/cs/interactive/answer-search.md b/content/russian/cs/interactive/answer-search.md index 28e4b4bc..0b38ce24 100644 --- a/content/russian/cs/interactive/answer-search.md +++ b/content/russian/cs/interactive/answer-search.md @@ -66,7 +66,7 @@ int solve() { Здесь, в отличие от предыдущей задачи, кажется, существует прямое решение с формулой. 
Но вместо того, чтобы о нем думать, можно просто свести задачу к обратной. Давайте подумаем, как по числу минут $t$ (ответу) понять, сколько листов напечатается за это время? Очень легко: $$ -\lfloor\frac{t}{x}\rfloor + \lfloor\frac{t}{y}\rfloor +\left \lfloor \frac{t}{x} \right \rfloor + \left \lfloor \frac{t}{y} \right \rfloor $$ -Ясно, что за $0$ минут $n$ листов распечатать нельзя, а за $xn$ минут один только первый принтер успеет напечатать $n$ листов. Поэтому $0$ и $xn$ — это подходящие изначальные границы для бинарного поиска. +Ясно, что за $0$ минут $n$ листов распечатать нельзя, а за $x \cdot n$ минут один только первый принтер успеет напечатать $n$ листов. Поэтому $0$ и $xn$ — это подходящие изначальные границы для бинарного поиска. diff --git a/content/russian/cs/layer-optimizations/_index.md b/content/russian/cs/layer-optimizations/_index.md index 492473b5..2456aa4c 100644 --- a/content/russian/cs/layer-optimizations/_index.md +++ b/content/russian/cs/layer-optimizations/_index.md @@ -10,10 +10,7 @@ date: 2021-08-29 **Задача.** Даны $n$ точек на прямой, отсортированные по своей координате $x_i$. Нужно найти $m$ отрезков, покрывающих все точки, минимизировав при этом сумму квадратов их длин. -**Базовое решение** — это следующая динамика: - -- $f[i, j]$ = минимальная стоимость покрытия $i$ первых точек, используя не более $j$ отрезков. -- Переход — перебор всех возможных последних отрезков, то есть +**Базовое решение** — определить состояние динамики $f[i, j]$ как минимальную стоимость покрытия $i$ первых точек используя не более $j$ отрезков. Пересчитывать её можно перебором всех возможных последних отрезков: $$ f[i, j] = \min_{k < i} \{f[k, j-1] + (x_{i-1}-x_k)^2 \} @@ -30,7 +27,7 @@ int cost(int i, int j) { } for (int i = 0; i <= m; i++) - f[0][k] = 0; // если нам не нужно ничего покрывать, то всё и так хорошо + f[0][i] = 0; // если нам не нужно ничего покрывать, то всё и так хорошо // все остальные f предполагаем равными бесконечности for (int i = 1; i <= n; i++) diff --git a/content/russian/cs/layer-optimizations/divide-and-conquer.md b/content/russian/cs/layer-optimizations/divide-and-conquer.md index a7731f49..c5e218db 100644 --- a/content/russian/cs/layer-optimizations/divide-and-conquer.md +++ b/content/russian/cs/layer-optimizations/divide-and-conquer.md @@ -8,44 +8,43 @@ published: true *Эта статья — одна из [серии](../). Рекомендуется сначала прочитать все предыдущие.* -Посмотрим на формулу пересчета динамики для базового решения: +Посмотрим на формулу пересчета динамики из базового решения: $$ f[i, j] = \min_{k < i} \{f[k, j-1] + (x_{i-1}-x_k)^2 \} $$ -Обозначим за $opt[i, j]$ оптимальный $k$ для данного состояния — то есть от выражения выше. Для однозначности, если оптимальный индекс не один, то выберем среди них самый правый. +Обозначим за $opt[i, j]$ оптимальный $k$ для данного состояния — то есть аргминимум от выражения выше. Для однозначности, если оптимальный индекс не один, то выберем среди них самый правый. -Конкретно в задаче покрытия точек отрезками, можно заметить следующее: +Конкретно в задаче покрытия точек отрезками можно заметить следующее: $$ -opt[i, j] \leq opt[i, j+1] +opt[i + 1, j] \leq opt[i, j] $$ -Интуиция такая: если у нас появился дополнительный отрезок, то последний отрезок нам не выгодно делать больше, а скорее наоборот его нужно «сжать». +Интуация такая: если нам нужно покрыть больший префикс точек, то начало последнего отрезка точно не будет раньше. 
-### Идея +### Алгоритм -Пусть мы уже знаем $opt[i, l]$ и $opt[i, r]$ и хотим посчитать $opt[i, j]$ для какого-то $j$ между $l$ и $r$. Тогда, воспользовавшись неравенством выше, мы можем сузить отрезок поиска оптимального индекса для $j$ со всего отрезка $[0, i-1]$ до $[opt[i, l], opt[i, r]]$. +Пусть мы уже знаем $opt[l, k]$ и $opt[r, k]$ и хотим посчитать $opt[i, k]$ для какого-то $i$ между $l$ и $r$. Тогда, воспользовавшись неравенством выше, мы можем сузить отрезок поиска оптимального индекса для $i$ со всего отрезка $[0, i - 1]$ до $[opt[l, k], opt[r, k]]$. -Будем делать следующее: заведем рекурсивную функцию, которая считает динамики для отрезка $[l, r]$, зная, что их $opt$ лежат между $l'$ и $r'$. Эта функция просто берет середину отрезка $[l, r]$ и линейным проходом считает ответ для неё, а затем рекурсивно запускается от половин, передавая в качестве границ $[l', opt]$ и $[opt, r']$ соответственно. - -### Реализация - -Один $k$-тый слой целиком пересчитывается из $(k-1)$-го следующим образом: +Будем делать следующее: заведем рекурсивную функцию, которая считает динамики для отрезка $[l, r]$ на $k$-том слое, зная, что их $opt$ лежат между $l'$ и $r'$. Эта функция просто берет середину отрезка $[l, r]$ и линейным проходом считает ответ для неё, а затем рекурсивно запускается от половин, передавая в качестве границ $[l', opt]$ и $[opt, r']$ соответственно: ```c++ +// [ l, r] -- какие динамики на k-том слое посчитать +// [_l, _r] -- где могут быть их ответы void solve(int l, int r, int _l, int _r, int k) { if (l > r) return; // отрезок пустой -- выходим int opt = _l, t = (l + r) / 2; + // считаем ответ для f[t][k] for (int i = _l; i <= min(_r, t); i++) { int val = f[i + 1][k - 1] + cost(i, t - 1); if (val < f[t][k]) f[t][k] = val, opt = i; } - solve(l, t - 1, _l, opt, k); - solve(t + 1, r, opt, _r, k); + solve(l, t - 1, _l, opt, k); + solve(t + 1, r, opt, _r, k); } ``` @@ -56,8 +55,6 @@ for (int k = 1; k <= m; k++) solve(0, n - 1, 0, n - 1, k); ``` -### Асимптотика - Так как отрезок $[l, r]$ на каждом вызове уменьшается примерно в два раза, глубина рекурсии будет $O(\log n)$. Так как отрезки поиска для всех элементов на одном «уровне» могут пересекаться разве что только по границам, то суммарно на каждом уровне поиск проверит $O(n)$ различных индексов. Соответственно, пересчет всего слоя займет $O(n \log n)$ операций вместо $O(n^2)$ в базовом решении. -Таким образом, мы улучшили асимптотику до $O(n m \log n)$. +Таким образом, мы улучшили асимптотику до $O(n \cdot m \cdot \log n)$. diff --git a/content/russian/cs/layer-optimizations/knuth.md b/content/russian/cs/layer-optimizations/knuth.md index 5c49dbe6..8a184d2d 100644 --- a/content/russian/cs/layer-optimizations/knuth.md +++ b/content/russian/cs/layer-optimizations/knuth.md @@ -9,13 +9,13 @@ prerequisites: Предыдущий метод оптимизации опирался на тот факт, что $opt[i, j] \leq opt[i, j + 1]$. -Асимптотику можно ещё улучшить, заметив, что $opt$ монотонен ещё и по первому параметру: +Асимптотику можно ещё улучшить, заметив, что $opt$ монотонен также и по второму параметру: $$ -opt[i-1, j] \leq opt[i, j] \leq opt[i, j+1] +opt[i - 1, j] \leq opt[i, j] \leq opt[i, j + 1] $$ -В задаче про покрытие отрезками это выполняется примерно по той же причине: если нам нужно покрывать меньше точек, то новый оптимальный последний отрезок будет начинаться не позже старого. 
+В задаче про покрытие отрезками это выполняется примерно по той же причине: если нам доступно больше отрезков, то последний отрезок в оптимальном решении точно не будет длиннее, чем раньше. ### Алгоритм diff --git a/content/russian/cs/matching/matching-problems.md b/content/russian/cs/matching/matching-problems.md index 12a99f79..cd14e54e 100644 --- a/content/russian/cs/matching/matching-problems.md +++ b/content/russian/cs/matching/matching-problems.md @@ -4,13 +4,14 @@ weight: 3 authors: - Сергей Слотин - Максим Иванов +date: 2022-01-28 --- Алгоритм нахождения паросочетания далеко не настолько сложный, насколько сложно сводить задачи к нему. Начнём с простых примеров. -## Кубики +### Кубики Дано $n$ кубиков, у каждого из них 6 граней, на каждой гране написана какая-то буква. Дано слово $s$, и требуется каждой букве слова $s$ сопоставить уникальный кубик, так чтобы мы могли повернуть этот кубик и получить нужную нам букву. @@ -26,7 +27,7 @@ authors: По определению паросочетания мы не сопоставим ни один кубик нескольким буквам, но так как наше паросочетание — максимально, то мы покроем максимально возможное количество букв. -## Доминошки +### Доминошки Есть прямоугольное поле $n \times m$, которое содержит какие-то выколотые клетки. Надо положить на это поле как можно больше костей домино (прямоугольников размера $1 \times 2$), но с условием, что поверх выколотых полей ничего лежать не должно. @@ -34,7 +35,7 @@ authors: Ответ — максимальное паросочетание в таком графе. Асимптотика с алгоритмом Куна $O(n^2 m^2)$, потому что у нас будет $O(nm)$ вершин и рёбер. -## Покрытие путями DAG +### Покрытие путями DAG Разберем более сложную задачу, до решения которой самостоятельно додуматься сложно. @@ -54,21 +55,19 @@ authors: Мы теперь можем свести задачу к нахождению максимального паросочетания в двудольном графе $H$. После нахождения этого паросочетания мы должны преобразовать его в набор путей в $G$. Это делается тривиальным алгоритмом: возьмем $a_1$, посмотрим, с какой $b_k$ она соединена, посмотрим на $a_k$ и так далее. Некоторые вершины могут остаться ненасыщенными — в таком случае в ответ надо добавить пути нулевой длины из каждой из этих вершин. -## Минимальное вершинное покрытие +### Минимальное вершинное покрытие -**Задача**. Дан граф. Назовем *вершинным покрытием* такое множество вершин, что каждое ребро графа инцидентно хотя бы одной вершине из множества. Необходимо найти вершинное покрытие наименьшего размера. +**Задача**. Назовем *вершинным покрытием* графа такое множество вершин, что каждое ребро графа инцидентно хотя бы одной вершине из множества. Необходимо найти вершинное покрытие наименьшего размера. -Следует заметить, что в общем случае это очень сложная задача, но для двудольных графов она имеет достаточно простое решение. +В общем случае это NP-полная задача, но для двудольных графов она имеет достаточно простое решение. -**Теорема**. $\mid V_{min} \mid \le \mid M \mid$, где $V_{min}$ — минимальное вершинное покрытие, а $M$ — максимальное паросочетание. - -**Доказательство**. $\mid V_{min} \mid \ge \mid M \mid$, поскольку $M$ — множество независимых ребер. Теперь приведем алгоритм, который строит вершинное покрытие размера $\mid M \mid$. Очевидно, оно будет минимальным. +Обозначим за $V_{min}$ наименьшее вершинное покрытие, а за $M$ — максимальное паросочетание в графе. Тогда сразу заметим, что $|V_{min}| \ge |M|$, потому что $M$ — множество независимых ребер. Теперь приведем алгоритм, который строит вершинное покрытие размера ровно $|M|$. Очевидно, оно будет минимальным. 
**Алгоритм**. Мысленно ориентируем ребра графа: ребра из $M$ проведем из правой доли в левую, остальные — из левой в правую, после чего запустим обход в глубину из всех вершин левой доли, не включенных в $M$. ![](https://neerc.ifmo.ru/wiki/images/4/4c/Bipartdfs_right.jpg) -Заметим, что граф разбился на несколько множеств: $L^+, L^-, R^+, R^-$, где "плюсовые" множества — это множества посещенных в процессе обхода вершин. В графе такого вида не бывает ребер $L^+ \rightarrow R^-$, $L^- \leftarrow R^+$ по очевидным соображениям. Ребер $L^+ \leftarrow R^-$ не бывает, потому что в противном случае паросочетание $M$ не максимальное — его можно дополнить ребрами такого типа. +Заметим, что граф разбился на несколько множеств: $L^+, L^-, R^+, R^-$, где «плюсовые» множества — это множества посещенных в процессе обхода вершин. В графе такого вида не бывает ребер $L^+ \rightarrow R^-$ и $L^- \leftarrow R^+$ по очевидным соображениям. Ребер $L^+ \leftarrow R^-$ не бывает, потому что в противном случае паросочетание $M$ не максимальное — его можно дополнить ребрами такого типа. $$ L^- \cup R^+ = V_{min} @@ -78,12 +77,10 @@ $$ **Упражнение**. Подумайте, как это можно применить к решению задачи о нахождении [максимального независимого множества](https://neerc.ifmo.ru/wiki/index.php?title=%D0%A1%D0%B2%D1%8F%D0%B7%D1%8C_%D0%B2%D0%B5%D1%80%D1%88%D0%B8%D0%BD%D0%BD%D0%BE%D0%B3%D0%BE_%D0%BF%D0%BE%D0%BA%D1%80%D1%8B%D1%82%D0%B8%D1%8F_%D0%B8_%D0%BD%D0%B5%D0%B7%D0%B0%D0%B2%D0%B8%D1%81%D0%B8%D0%BC%D0%BE%D0%B3%D0%BE_%D0%BC%D0%BD%D0%BE%D0%B6%D0%B5%D1%81%D1%82%D0%B2%D0%B0). -## Паросочетание минимального веса +### Паросочетание минимального веса Пусть у вершин левой доли есть какие-то веса, и нам нужно набрать максимальное паросочетание минимального веса. -Выясняется, что можно просто отсортировать вершины левой доли по весу и пытаться в таком порядке добавлять их в паросочетание стандартным алгоритмом Куна. - -Для доказательства этого факта читатель может прочитать про [жадный алгоритм Радо-Эдмондса](/cs/greedy/matroid), частным случаем которого является такая модификация алгоритма Куна. +Выясняется, что можно просто отсортировать вершины левой доли по весу и пытаться в таком порядке добавлять их в паросочетание стандартным алгоритмом Куна. Для доказательства этого факта читатель может прочитать про [жадный алгоритм Радо-Эдмондса](/cs/combinatorial-optimization/matroid), частным случаем которого является такая модификация алгоритма Куна. -[Аналогичную задачу](/cs/flows/mincost-maxflow), когда у *ребер* есть веса, проще всего решать сведением к потоку минимальной стоимости. +Аналогичную задачу, но когда у *ребер* есть веса, проще всего решать сведением к нахождению [потока минимальной стоимости](/cs/flows/mincost-maxflow). 
diff --git a/content/russian/cs/modular/reciprocal.md b/content/russian/cs/modular/reciprocal.md index 5d0e34e9..7b966de3 100644 --- a/content/russian/cs/modular/reciprocal.md +++ b/content/russian/cs/modular/reciprocal.md @@ -99,7 +99,7 @@ $$ ax + my = 1 \iff ax \equiv 1 \iff x \equiv a^{-1} \pmod m $$ int inv(int a, int m) { if (a == 1) return 1; - return (1 - inv(m % a, a) * m) / a + m; + return (1 - 1ll * inv(m % a, a) * m) / a + m; } ``` diff --git a/content/russian/cs/numerical/newton.md b/content/russian/cs/numerical/newton.md index 248e1b4e..5426cff5 100644 --- a/content/russian/cs/numerical/newton.md +++ b/content/russian/cs/numerical/newton.md @@ -66,9 +66,9 @@ double sqrt(double n) { Запустим метод Ньютона для поиска квадратного корня $2$, начиная с $x_0 = 1$, и посмотрим, сколько первых цифр оказались правильными после каждой итерации: -
-1
-1.5
+
+1.0000000000000000000000000000000000000000000000000000000000000
+1.5000000000000000000000000000000000000000000000000000000000000
 1.4166666666666666666666666666666666666666666666666666666666675
 1.4142156862745098039215686274509803921568627450980392156862745
 1.4142135623746899106262955788901349101165596221157440445849057
diff --git a/content/russian/cs/persistent/persistent-array.md b/content/russian/cs/persistent/persistent-array.md
index e476c355..018c287a 100644
--- a/content/russian/cs/persistent/persistent-array.md
+++ b/content/russian/cs/persistent/persistent-array.md
@@ -2,8 +2,9 @@
 title: Структуры с откатами
 weight: 1
 authors:
-- Сергей Слотин
-date: 2021-09-12
+  - Сергей Слотин
+date: {}
+published: true
 ---
 
 Состояние любой структуры как-то лежит в памяти: в каких-то массивах, или в более общем случае, по каким-то определенным адресам в памяти. Для простоты, пусть у нас есть некоторый массив $a$ размера $n$, и нам нужно обрабатывать запросы присвоения и чтения, а также иногда откатывать изменения обратно.
@@ -20,7 +21,7 @@ int a[N];
 stack<pair<int, int>> s;
 
 void change(int k, int x) {
-    l.push({k, a[k]});
+    s.push({k, a[k]});
     a[k] = x;
 }
 
@@ -84,7 +85,7 @@ void rollback() {
 
 ```cpp
 int t = 0;
-vector versions[N];
+vector<pair<int, int>> versions[N];
 
 void change(int k, int x) {
     versions[k].push_back({t++, x});
diff --git a/content/russian/cs/programming/bayans.md b/content/russian/cs/programming/bayans.md
index bb8b4471..d7b42267 100644
--- a/content/russian/cs/programming/bayans.md
+++ b/content/russian/cs/programming/bayans.md
@@ -4,11 +4,12 @@ weight: 100
 authors:
 - Сергей Слотин
 created: 2017-2019
+date: 2022-07-17
 ---
 
 Везде, где не указано — время работы $O(n)$, а если есть конкретные числа, то TL 1 секунда.
 
-Задачи идут в порядке вспоминания, то есть в весьма рандомном.
+Задачи идут в порядке вспоминания/придумывания, то есть в весьма рандомном.
 
 ## Попугаи
 
@@ -56,6 +57,26 @@ created: 2017-2019
 
 На стороне интерактора был сгенерирован массив из $n$ случайных чисел, *равномерно распределенных* на некотором промежутке, а затем отсортирован. За один запрос можно по индексу в массиве узнать число, которое там лежит. Требуется за $O(\log \log n)$ операций *в среднем* определять, есть ли в данном массиве число $x$.
 
+## Случайный бинарный поиск
+
+Дан такой бинарный поиск, в котором мы вместо элемента из середины берем случайный:
+
+```c++
+int lower_bound(int x) {
+    int l = 0, r = n - 1;
+    while (l < r) {
+        int m = l + rand() % (r - l);
+        if (t[m] >= x)
+            r = m;
+        else
+            l = m + 1;
+    }
+    return l;
+}
+```
+
+Все ключи и запросы случайные. При $n \to \infty$, во сколько *в среднем* раз больше итераций рандомизированный бинарный поиск будет делать по сравнению с обычным?
+
 ## Замкнутые ломаные
 
 Даны две замкнутые несамопересекающиеся ломаные. Определите, можно ли перевести их друг в друга с помощью параллельного переноса, поворотов и гомотетии?
@@ -101,12 +122,24 @@ created: 2017-2019
 
 ## Нулевая сумма
 
-Дано  мультимножество из $n$ целых чисел. Найдите любое его подмножество, сумма чисел которого делится на $n$.
+Дано мультимножество из $n$ целых чисел. Найдите любое его непустое подмножество, сумма чисел которого делится на $n$.
 
 ## Мета-задача
 
 В задаче дана произвольная строка, по которой известным только авторам способом генерируется ответ yes/no. В задаче 100 тестов. У вас есть 20 попыток. В качестве фидбэка вам доступны вердикты на каждом тесте. Вердикта всего два: OK (ответ совпал) и WA. Попытки поделить на ноль, выделить терабайт памяти и подобное тоже считаются как WA. «Решите» задачу.
 
+## Мета-задача 2
+
+Условие как в «Мета-задаче», но сообщается только число пройденных тестов.
+
+100 тестов, 70 попыток.
+
+## Мета-задача 3
+
+Условие как в «Мета-задаче», но сообщается только номер первого не пройденного теста.
+
+10 тестов, 100 попыток.
+
 ## Ниточка
 
 В плоскую доску вбили $n$ гвоздей радиуса $r$, причём так, что соответствующие точки на плоскости образуют вершины выпуклого многоугольника. На эти гвозди натянули ниточку, причём ниточка «огибает» по кругу гвозди. Найдите длину ниточки, то есть периметр этого многоугольника с учётом закругления.
@@ -282,3 +315,56 @@ def query(y):
 ```
 
 Ваша задача — отгадать число, используя не более 10000 попыток.
+
+## Коммивояжер
+
+Даны $3 \cdot 10^5$ точек на плоскости. Выберите среди них любое подмножество из 500 точек и решите для него задачу коммивояжера: найдите минимальный по длине цикл, проходящий через все эти точки.
+
+## Анаграммы
+
+Найдите в строке $s$ первую подстроку, являющуюся анаграммой (перестановкой символов) строки $t$, за $O(n)$.
+
+## Функциональный граф
+
+Дан ориентированный граф из $n < 10^5$ вершин, в котором из каждой вершины ведет ровно одно ребро. Требуется ответить на $q < 10^5$ запросов «в какую вершину мы попадем, если начнем в вершине $v_i$ и сделаем $k_i < 10^{18}$ переходов» за время $O(q + n)$.
+
+## Асинхронная шляпа
+
+Серёжа и его $(n - 1)$ друзей решили поиграть в «шляпу», в которой один игрок должен за ограниченное время объяснить как можно больше слов, чтобы его партнер их отгадал.
+
+Каждый игрок должен пообщаться с каждым другим по разу; обычно игра проводится так:
+
+- 1-й игрок объясняет в течение минуты слова 2-му,
+- 2-й игрок объясняет слова 3-му,
+- ...,
+- $n$-й игрок объясняет слова 1-му,
+- 1-й игрок объясняет слова 3-му,
+- 2-й игрок объясняет слова 4-му…
+
+…и так далее, пока $(n-1)$-й игрок не закончит объяснять слова $(n-2)$-ому.
+
+Если друзей собралось много, то игра может занять приличное время. Серёжу интересует, какое минимальное время она может длиться, если разрешить парам участников общаться между собой одновременно и в любом порядке.
+
+Для данного $n \le 500$, найдите минимальное количество времени $k$ и соответствующее ему расписание.
+
+## Random coffee
+
+В компании, в которой вы работаете, устроено неизвестное число людей — от одного до бесконечности с равной вероятностью. Для борьбы с одиночеством, каждый сотрудник участвует в «random coffee»: каждую неделю вы встречаетесь со случайным человеком из компании, чтобы попить кофе и обсудить что угодно.
+
+Вы участвовали в random coffee $n$ раз и пообщались с $k$ разными людьми (с некоторыми — более одного раза). Какое наиболее вероятное число человек работает в компании?
+
+## Мафия
+
+В «мафию» играют 13 человек, из которых 10 мирных и 3 мафии. Все роли розданы с помощью стандартной колоды игральных карт: заранее выбрали и перемешали 10 красных и 3 чёрные карты, кто вытянул черную — мафия. Все карты различны и известны всем. Игра начинается с дневного голосования.
+
+Как мирным гарантированно победить?
+
+
+
+
diff --git a/content/russian/cs/programming/stress-test.md b/content/russian/cs/programming/stress-test.md
index b20c77b6..c67d1237 100644
--- a/content/russian/cs/programming/stress-test.md
+++ b/content/russian/cs/programming/stress-test.md
@@ -151,12 +151,12 @@ _, f1, f2, gen, iters = sys.argv
 
 for i in range(int(iters)):
     print('Test', i + 1)
-    os.popen('python3 %s > test.txt' % gen)
-    v1 = os.popen('./%s < test.txt' % f1).read()
-    v2 = os.popen('./%s < test.txt' % f2).read()
+    os.system(f'python3 {gen} > test.txt')
+    v1 = os.popen(f'./{f1} < test.txt').read()
+    v2 = os.popen(f'./{f2} < test.txt').read()
     if v1 != v2:
         print("Failed test:")
-        print(open("text.txt").read())
+        print(open("test.txt").read())
         print(f'Output of {f1}:')
         print(v1)
         print(f'Output of {f2}:')
diff --git a/content/russian/cs/range-queries/fenwick.md b/content/russian/cs/range-queries/fenwick.md
index f07a1ed4..9e37fc8d 100644
--- a/content/russian/cs/range-queries/fenwick.md
+++ b/content/russian/cs/range-queries/fenwick.md
@@ -84,7 +84,7 @@ int sum (int r1, int r2) {
     int res = 0;
     for (int i = r1; i > 0; i -= i & -i)
         for (int j = r2; j > 0; j -= j & -j)
-            ans += t[i][j];
+            res += t[i][j];
     return res;
 }
 ```
diff --git a/content/russian/cs/range-queries/img/prefix-sum.png b/content/russian/cs/range-queries/img/prefix-sum.png
new file mode 100644
index 00000000..4e00190a
Binary files /dev/null and b/content/russian/cs/range-queries/img/prefix-sum.png differ
diff --git a/content/russian/cs/range-queries/prefix-sum.md b/content/russian/cs/range-queries/prefix-sum.md
index 861200a1..f4e02570 100644
--- a/content/russian/cs/range-queries/prefix-sum.md
+++ b/content/russian/cs/range-queries/prefix-sum.md
@@ -52,13 +52,15 @@ $$
 
 Для ответа на запрос поиска суммы на произвольном полуинтервале нужно просто вычесть друг из друга две предподсчитанные префиксные суммы.
 
-@@
+
+
+![](../img/prefix-sum.png)
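+
+Например, минимальный набросок такого решения (обозначения `a`, `p` и `maxn` здесь условные и могут отличаться от принятых в статье):
+
+```c++
+const int maxn = 1e5;
+int a[maxn], p[maxn + 1]; // p[i] -- сумма первых i элементов, p[0] = 0
+
+void build(int n) {
+    for (int i = 0; i < n; i++)
+        p[i + 1] = p[i] + a[i];
+}
+
+int sum(int l, int r) { // сумма на полуинтервале [l, r)
+    return p[r] - p[l];
+}
+```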
 
 ### Другие операции
 
diff --git a/content/russian/cs/range-queries/sqrt-structures.md b/content/russian/cs/range-queries/sqrt-structures.md
index bac0da16..25fe3b5e 100644
--- a/content/russian/cs/range-queries/sqrt-structures.md
+++ b/content/russian/cs/range-queries/sqrt-structures.md
@@ -1,10 +1,10 @@
 ---
 title: Корневые структуры
 authors:
-- Сергей Слотин
-- Иван Сафонов
+  - Сергей Слотин
+  - Иван Сафонов
 weight: 6
-date: 2021-09-13
+date: 2022-08-16
 ---
 
 Корневые оптимизации можно использовать много для чего, в частности в контексте структур данных.
@@ -23,16 +23,15 @@ date: 2021-09-13
 ```c++
 // c это и количество блоков, и также их размер; оно должно быть чуть больше корня
 const int maxn = 1e5, c = 330;
-int a[maxn], b[c];
-int add[c];
+int a[maxn], b[c], add[c];
 
 for (int i = 0; i < n; i++)
     b[i / c] += a[i];
 ```
 
-Заведем также массив `add` размера $\sqrt n$, который будем использовать для отложенной операции прибавления на блоке. Будем считать, что реальное значение $i$-го элемента равно `a[i] + add[i / c]`.
+Заведем также массив `add` размера $\sqrt n$, который будем использовать для отложенной операции прибавления на блоке: будем считать, что реальное значение $i$-го элемента равно `a[i] + add[i / c]`.
 
-Теперь мы можем отвечать на запросы первого типа за $O(\sqrt n)$ на запрос:
+Теперь мы можем отвечать на запросы первого типа за $O(\sqrt n)$ операций на запрос:
 
 1. Для всех блоков, лежащих целиком внутри запроса, просто возьмём уже посчитанные суммы и сложим.
 2. Для блоков, пересекающихся с запросом только частично (их максимум два — правый и левый), проитерируемся по нужным элементам и поштучно прибавим к ответу.
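+
+Для иллюстрации, примерный набросок такой функции (границы запроса здесь считаются полуинтервалом $[l, r)$, а имена массивов взяты из кода выше):
+
+```c++
+// сумма реальных значений на полуинтервале [l, r)
+long long sum(int l, int r) {
+    long long res = 0;
+    while (l < r) {
+        if (l % c == 0 && l + c <= r) {
+            // блок целиком внутри запроса: берём готовую сумму блока и отложенную прибавку
+            res += b[l / c] + (long long) add[l / c] * c;
+            l += c;
+        } else {
+            res += a[l] + add[l / c];
+            l++;
+        }
+    }
+    return res;
+}
+```
+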
@@ -68,6 +67,7 @@ void upd(int l, int r, int x) {
             l += c;
         }
         else {
+            b[l / c] += x;
             a[l] += x;
             l++;
         }
 vector<vector<int>> blocks;
 // возвращает индекс блока и индекс элемента внутри блока
 pair<int, int> find_block(int pos) {
     int idx = 0;
-    while (blocks[idx].size() >= pos)
-        pos -= blocks[idx--].size();
+    while (blocks[idx].size() <= pos)
+        pos -= blocks[idx++].size();
     return {idx, pos};
 }
 ```
diff --git a/content/russian/cs/segment-tree/lazy-initialization.md b/content/russian/cs/segment-tree/lazy-initialization.md
index 3e789b30..d8a4bd49 100644
--- a/content/russian/cs/segment-tree/lazy-initialization.md
+++ b/content/russian/cs/segment-tree/lazy-initialization.md
@@ -6,9 +6,9 @@ prerequisites:
 - lazy-propagation
 ---
 
-Рассмотрим нашу любимую задачу суммы на подотрезках, но теперь все индексы лежат не от в пределах $10^6$, а $10^9$ или даже $10^{18}$.
+Рассмотрим нашу любимую задачу суммы на подотрезках, но теперь все индексы лежат не в пределах $10^5$ или $10^6$, а до $10^9$ или даже $10^{18}$.
 
-Все асимптотики нас по прежнему устраивают:
+Все асимптотики нас по-прежнему более-менее устраивают:
 
 $$
     \log_2 10^6 \approx 20
@@ -16,9 +16,9 @@ $$
 \\  \log_2 10^{18} \approx 60
 $$
 
-кроме этапа построения, работающего за линейное от $n$ время.
+Единственная проблема — это этап построения, работающий за линейное от $n$ время и память.
 
-Можно решить эту проблему так: откажемся от явного создания всех вершин дерева в самом начале. Изначально создадим только лишь корень, а остальные вершины будем создавать на ходу, когда в них потребуется записать что-то не дефолтное — как в lazy propagation.
+Решить её можно, отказавшись от явного создания всех вершин дерева в самом начале. Изначально создадим лишь корень, а остальные вершины будем создавать на ходу, когда в них потребуется записать что-то не дефолтное — как в [lazy propagation](../lazy-propagation):
 
 ```cpp
 struct Segtree {
diff --git a/content/russian/cs/sequences/_index.md b/content/russian/cs/sequences/_index.md
index d02ed49b..6888831d 100644
--- a/content/russian/cs/sequences/_index.md
+++ b/content/russian/cs/sequences/_index.md
@@ -1,7 +1,6 @@
 ---
 title: Последовательности
 weight: 4
-draft: true
 ---
 
-В этой главе рассматриваются некоторые алгоритмы на неотсортированных последовательностях.
+В этой главе рассматриваются алгоритмы для неотсортированных последовательностей.
diff --git a/content/russian/cs/sequences/compression.md b/content/russian/cs/sequences/compression.md
index ffd0bd79..5b469fec 100644
--- a/content/russian/cs/sequences/compression.md
+++ b/content/russian/cs/sequences/compression.md
@@ -3,45 +3,64 @@ title: Сжатие координат
 authors:
 - Сергей Слотин
 weight: -1
+date: 2022-04-20
 ---
 
+Часто бывает полезно преобразовать последовательность чисел либо каких-то других объектов в промежуток последовательных целых чисел — например, чтобы использовать её элементы как индексы в массиве либо какой-нибудь другой структуре.
 
-## Сжатие координат
-Это общая идея, которая может оказаться полезной. Пусть, есть $n$ чисел $a_1,\ldots,a_n$. Хотим, преобразовать $a_i$ так, чтобы равные остались равными, разные остались разными, но все они были от 0 до $n-1$. Для этого надо отсортировать числа, удалить повторяющиеся и заменить каждое $a_i$ на его индекс в отсортированном массиве.
+Эта задача эквивалентна нумерации элементов множества, что можно сделать за $O(n)$ через хеш-таблицу:
 
+```c++
+vector<int> compress(vector<int> a) {
+    unordered_map<int, int> m;
 
-```
-int a[n], all[n];
-for (int i = 0; i < n; ++i) {
-    cin >> a[i];
-    all[i] = a[i];
+    for (int &x : a) {
+        if (!m.count(x))
+            m[x] = m.size(); // новым элементам выдаём номера в порядке появления
+        x = m[x];
+    }
+
+    return a;
 }
-sort(all, all + n);
-m = unique(all, all + n) - all; // теперь m - число различных координат
-for (int i = 0; i < n; ++i)
-    a[i] = lower_bound(all, all + m, x[i]) - all;
 ```
 
-```cpp
+Элементам будут присвоены номера в порядке их первого вхождения в последовательность. Если нужно сохранить *порядок*, присвоив меньшим элементам меньшие номера, то задача становится чуть сложнее, и её можно решить разными способами.
+
+Как вариант, можно отсортировать массив, а затем два раза пройтись по нему с хэш-таблицей — в первый раз заполняя её, а во второй раз сжимая сам массив:
+
+```c++
+vector<int> compress(vector<int> a) {
+    vector<int> b = a;
+    sort(b.begin(), b.end());
+
     unordered_map<int, int> m;
-    for (int x : a)
-        if (m.count(x))
+
+    for (int x : b)
+        if (!m.count(x))
             m[x] = m.size();
+
     for (int &x : a)
         x = m[x];
+
     return a;
 }
 ```
 
+Также можно выкинуть из отсортированного массива дубликаты (за линейное время), а затем использовать его для нахождения индекса каждого элемента исходного массива бинарным поиском:
 
-```cpp
+```c++
 vector<int> compress(vector<int> a) {
     vector<int> b = a;
+
     sort(b.begin(), b.end());
     b.erase(unique(b.begin(), b.end()), b.end());
+
     for (int &x : a)
         x = int(lower_bound(b.begin(), b.end(), x) - b.begin());
+
     return a;
 }
 ```
+
+Оба подхода работают за $O(n \log n)$. Используйте тот, который больше нравится.
diff --git a/content/russian/cs/sequences/inversions.md b/content/russian/cs/sequences/inversions.md
index f18d1f4a..2fbec7d9 100644
--- a/content/russian/cs/sequences/inversions.md
+++ b/content/russian/cs/sequences/inversions.md
@@ -4,13 +4,18 @@ title: Число инверсий
 weight: 5
 authors:
 - Сергей Слотин
+draft: true
 ---
 
-Пусть у нас есть некоторая перестановка $p$ (какая-то последовательность чисел от $1$ до $n$, где все числа встречаются ровно один раз). *Инверсией* называется пара индексов $i$ и $j$ такая, что $i < j$ и $p_i > p_j$. Требуется найти количество инверсий в данной перестановке.
+**Определение.** *Инверсией* в перестановке $p$ называется пара индексов $i$ и $j$ такая, что $i < j$ и $p_i > p_j$.
 
-## Наивный алгоритм
+Например:
 
-Эта задача легко решается за $O(n^2)$ обычным перебором всех пар индексов и проверкой каждого на инверсию:
+- в перестановке $[1, 2, 3]$ инверсий нет,
+- в $[1, 3, 2]$ одна инверсия ($3 \leftrightarrow 2$),
+- в $[3, 2, 1]$ три инверсии ($3 \leftrightarrow 2$, $3 \leftrightarrow 1$ и $2 \leftrightarrow 1$).
+
+В этой статье мы рассмотрим, как находить количество инверсий в перестановке. Эта задача легко решается за $O(n^2)$ обычным перебором всех пар индексов и проверкой каждого на инверсию:
 
 ```cpp
 int count_inversions(int *p, int n) {
@@ -23,6 +28,8 @@ int count_inversions(int *p, int n) {
 }
 ```
 
+Решить её быстрее уже сложнее.
+
 ## Сортировкой слиянием
 
 Внезапно эту задачу можно решить сортировкой слиянием, слегка модифицировав её.
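+
+Примерный набросок такой модификации (интерфейс условный: функция сортирует полуинтервал $[l, r)$ и возвращает число инверсий внутри него):
+
+```c++
+long long count_inversions(vector<int> &p, int l, int r) {
+    if (r - l <= 1)
+        return 0;
+    int m = (l + r) / 2;
+    long long res = count_inversions(p, l, m) + count_inversions(p, m, r);
+    vector<int> t;
+    int i = l, j = m;
+    while (i < m || j < r) {
+        if (j == r || (i < m && p[i] <= p[j])) {
+            t.push_back(p[i++]);
+        } else {
+            res += m - i; // все несли́тые элементы левой половины больше p[j]
+            t.push_back(p[j++]);
+        }
+    }
+    copy(t.begin(), t.end(), p.begin() + l);
+    return res;
+}
+```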
diff --git a/content/russian/cs/sequences/quickselect.md b/content/russian/cs/sequences/quickselect.md
index b1606bbd..7e83a267 100644
--- a/content/russian/cs/sequences/quickselect.md
+++ b/content/russian/cs/sequences/quickselect.md
@@ -1,12 +1,12 @@
 ---
-# TODO: реализация
 title: Порядковые статистики
 weight: 4
+draft: true
 ---
 
 Если в [начале предыдущей главы](/cs/interactive/binary-search) мы искали число элементов массива, меньших $x$ — также известное как индекс этого элемента в отсортированном массиве — то теперь нас интересует обратная задача: узнать, какой элемент $k$-тый по возрастанию.
 
-Если массив уже отсортирован, то задача тривиальная — просто берем $k$-тый элемент. Иначе мы его можем отсортировать, но на это потребуется $O(n \log n)$ операций — и мы знаем, что используя только сравнения быстрее не получится.
+Если массив уже отсортирован, то задача тривиальная: просто берем $k$-тый элемент. Иначе мы его можем отсортировать, но на это потребуется $O(n \log n)$ операций — и мы знаем, что если мы используем только сравнения, быстрее не получится.
 
 Есть другой подход — мы можем модифицировать алгоритм быстрой сортировки.
 
@@ -26,4 +26,17 @@ weight: 4
 
 Подумав над тем, что размер отрезка каждый раз убывает приблизительно в 2 раза, над ограниченностью суммы $n + \frac{n}{2} + \frac{n}{4} + \ldots = 2 \cdot n$, и немного помахав руками, получаем, что алгоритм работает за $O(n)$. 
 
+
+
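+Примерный набросок такой модификации (интерфейс условный: функция возвращает $k$-ю по возрастанию порядковую статистику полуинтервала $[l, r)$, нумерация с нуля):
+
+```c++
+int quickselect(vector<int> &a, int l, int r, int k) {
+    while (true) {
+        if (r - l == 1)
+            return a[l];
+        int pivot = a[l + rand() % (r - l)];
+        // трёхчастное разбиение: [меньше pivot][равные pivot][больше pivot]
+        int i = l, j = l, n = r;
+        while (j < n) {
+            if (a[j] < pivot)
+                swap(a[i++], a[j++]);
+            else if (a[j] == pivot)
+                j++;
+            else
+                swap(a[j], a[--n]);
+        }
+        if (k < i - l)
+            r = i;            // ответ в части «меньше»
+        else if (k < j - l)
+            return pivot;     // ответ в части «равные»
+        else {
+            k -= j - l;       // переходим в часть «больше»
+            l = j;
+        }
+    }
+}
+```
+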
 В C++ этот алгоритм уже реализован и доступен как `nth_element`.
diff --git a/content/russian/cs/set-structures/dsu.md b/content/russian/cs/set-structures/dsu.md
index 6c9a4d80..ee437a43 100644
--- a/content/russian/cs/set-structures/dsu.md
+++ b/content/russian/cs/set-structures/dsu.md
@@ -66,7 +66,7 @@ int leader(int v) {
 
 Следующие две эвристики похожи по смыслу и стараются оптимизировать высоту дерева, выбирая оптимальный корень для переподвешивания.
 
-**Ранговая эвристика**. Будем хранить для каждой вершины её *ранг* — высоту её поддереа. При объединении деревьев будем делать корнем нового дерева ту вершину, у которой ранг больше, и пересчитывать ранги (ранг у лидера должен увеличиться на единицу, если он совпадал с рангом другой вершины). Эта эвристика оптимизирует высоту дерева напрямую.
+**Ранговая эвристика**. Будем хранить для каждой вершины её *ранг* — высоту её поддерева. При объединении деревьев будем делать корнем нового дерева ту вершину, у которой ранг больше, и пересчитывать ранги (ранг у лидера должен увеличиться на единицу, если он совпадал с рангом другой вершины). Эта эвристика оптимизирует высоту дерева напрямую.
 
 ```cpp
 void unite(int a, int b) {
diff --git a/content/russian/cs/shortest-paths/bfs.md b/content/russian/cs/shortest-paths/bfs.md
index 1893d5db..c96804c4 100644
--- a/content/russian/cs/shortest-paths/bfs.md
+++ b/content/russian/cs/shortest-paths/bfs.md
@@ -1,13 +1,14 @@
 ---
 title: Поиск в ширину
 authors:
-- Александр Гришутин
-- Станислав Алексеев
-- "[Максим Иванов](https://e-maxx.ru/algo/bfs)"
+  - Александр Гришутин
+  - Станислав Алексеев
+  - '[Максим Иванов](https://e-maxx.ru/algo/bfs)'
 editors:
-- Сергей Слотин
+  - Сергей Слотин
 weight: 2
-date: 2021-09-30
+date: {}
+published: true
 ---
 
 *Поиск в ширину* (англ. *breadth-first search*) — один из основных алгоритмов на графах, позволяющий находить все кратчайшие пути от заданной вершины и решать многие другие задачи.
@@ -158,7 +159,7 @@ $$
 vector<int> d(n, -1);
 d[s] = 0;
 
-dequeue q;
+deque<int> q;
 q.push_back(s);
 
 while (!q.empty()) {
diff --git a/content/russian/cs/sorting/bubble.md b/content/russian/cs/sorting/bubble.md
index 2d9af9b5..38fa5c8a 100644
--- a/content/russian/cs/sorting/bubble.md
+++ b/content/russian/cs/sorting/bubble.md
@@ -1,9 +1,10 @@
 ---
 title: Сортировка пузырьком
 weight: 1
+published: true
 ---
 
-Наш первый подход будет заключаться в следующем: обозначим за $n$ длину массива и $n$ раз пройдёмся раз пройдемся по нему слева направо, меняя два соседних элемента, если первый больше второго.
+Наш первый подход будет заключаться в следующем: обозначим за $n$ длину массива и $n$ раз пройдёмся по нему слева направо, меняя два соседних элемента, если первый больше второго.
 
 Каждую итерацию максимальный элемент «всплывает» как пузырек к концу массива — отсюда и название.
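+
+Для наглядности, примерный набросок описанного алгоритма:
+
+```c++
+void bubble_sort(int *a, int n) {
+    for (int k = 0; k < n; k++)           // n проходов по массиву
+        for (int i = 0; i + 1 < n; i++)   // идём слева направо
+            if (a[i] > a[i + 1])
+                swap(a[i], a[i + 1]);     // меняем соседей, если они стоят не по порядку
+}
+```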
 
diff --git a/content/russian/cs/sorting/quicksort.md b/content/russian/cs/sorting/quicksort.md
index f3a6a5d6..e6494cd3 100644
--- a/content/russian/cs/sorting/quicksort.md
+++ b/content/russian/cs/sorting/quicksort.md
@@ -7,13 +7,18 @@ draft: true
 Быстрая сортировка заключается в том, что на каждом шаге мы находим опорный элемент, все элементы, которые меньше его кидаем в левую часть, остальные в правую, а затем рекурсивно спускаемся в обе части.
 
 ```cpp
+// partition - функция, разбивающая элементы
+// на меньшие и больше/равные опорного a[p];
+// возвращает границу разбиения
+int partition(int l, int r, int p);
+
 void quicksort(int l, int r){
     if (l < r){
         int index = (l + r) / 2; /* index - индекс опорного элемента для 
         начала сделаем его равным середине отрезка*/
-        index = divide(l, r, index); /* divide - функция разбивающие элементы 
-        на меньшие и больше/равные a[index], 
-        при этом функция возвращает границу разбиения*/
+        index = partition(l, r, index);
         quicksort(l, index);
         quicksort(index + 1, r);
     }
@@ -25,8 +30,6 @@ void quicksort(int l, int r){
 
 Существуют несколько выходов из этой ситуации :
 
-2. Давайте если быстрая сортировка работает долго, то запустим любую другую сортировку за $NlogN$.
-
-3. Давайте делить массив не на две, а на три части(меньше, равны, больше).
-
-4. Чтобы избавиться от проблемы с максимумом/минимумом в середине, давайте **брать случайный элемент**.
+1. Если быстрая сортировка работает слишком долго, запустим любую другую сортировку за $O(n \log n)$.
+2. Давайте делить массив не на две, а на три части (меньше, равные, больше).
+3. Чтобы избавиться от проблемы с максимумом/минимумом в середине, давайте **брать случайный элемент** (см. набросок ниже).
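+
+Примерный набросок функции `partition` из кода выше (разбиение Хоара; предполагается, что сортируемый массив `a` глобальный). Случайный выбор опорного элемента из пункта 3 получается, если передавать в `p` случайный индекс отрезка $[l, r]$:
+
+```c++
+int partition(int l, int r, int p) {
+    swap(a[l], a[p]); // ставим опорный элемент в начало отрезка
+    int pivot = a[l];
+    int i = l - 1, j = r + 1;
+    while (true) {
+        do i++; while (a[i] < pivot);
+        do j--; while (a[j] > pivot);
+        if (i >= j)
+            return j; // a[l..j] <= pivot <= a[j+1..r], причём j < r
+        swap(a[i], a[j]);
+    }
+}
+```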
diff --git a/content/russian/cs/sorting/selection.md b/content/russian/cs/sorting/selection.md
index 03491dec..30854b5f 100644
--- a/content/russian/cs/sorting/selection.md
+++ b/content/russian/cs/sorting/selection.md
@@ -1,6 +1,7 @@
 ---
 title: Сортировка выбором
 weight: 2
+published: true
 ---
 
 Похожим методом является **сортировка выбором** (минимума или максимума).
@@ -9,10 +10,10 @@ weight: 2
 
 ```cpp
 void selection_sort(int *a, int n) {
-    for (k = 0; k < n - 1; k++)
-        for (j = i + 1; j < n; j++)
-            if (a[i] > a[j])
-                swap(a[j], a[i]);
+    for (int k = 0; k < n - 1; k++)
+        for (int j = k + 1; j < n; j++)
+            if (a[k] > a[j])
+                swap(a[j], a[k]);
 }
 ```
 
diff --git a/content/russian/cs/spanning-trees/kruskal.md b/content/russian/cs/spanning-trees/kruskal.md
index ddb9cabf..1f4c98a4 100644
--- a/content/russian/cs/spanning-trees/kruskal.md
+++ b/content/russian/cs/spanning-trees/kruskal.md
@@ -34,4 +34,4 @@ for (auto [a, b, w] : edges) {
 }
 ```
 
-Раз остовные деревья являются частным случаем [матроида](/cs/greedy/matroid), то алгоритм Краскала является частным случаем алгоритма Радо-Эдмондса.
+Раз остовные деревья являются частным случаем [матроида](/cs/combinatorial-optimization/matroid), то алгоритм Краскала является частным случаем алгоритма Радо-Эдмондса.
diff --git a/content/russian/cs/spanning-trees/prim.md b/content/russian/cs/spanning-trees/prim.md
index d9a00c6e..ff250c70 100644
--- a/content/russian/cs/spanning-trees/prim.md
+++ b/content/russian/cs/spanning-trees/prim.md
@@ -2,7 +2,8 @@
 title: Алгоритм Прима
 weight: 2
 prerequisites:
-- safe-edge
+  - safe-edge
+published: true
 ---
 
 Лемма о безопасном ребре говорит, что мы можем строить минимальный остов постепенно, добавляя по одному ребра, про которые мы точно знаем, что они минимальные для соединения какого-то разреза.
@@ -47,7 +48,7 @@ min_edge[0] = 0;
 
 for (int i = 0; i < n; i++) {
     int v = -1;
-    for (int u = 0; u < n; j++)
+    for (int u = 0; u < n; u++)
         if (!used[u] && (v == -1 || min_edge[u] < min_edge[v]))
             v = u;
 
diff --git a/content/russian/cs/spanning-trees/safe-edge.md b/content/russian/cs/spanning-trees/safe-edge.md
index cc7138c9..19f97006 100644
--- a/content/russian/cs/spanning-trees/safe-edge.md
+++ b/content/russian/cs/spanning-trees/safe-edge.md
@@ -24,4 +24,4 @@ weight: 1
 - Если веса всех рёбер различны, то остов будет уникален.
 - Минимальный остов является также и остовом с минимальным произведением весов рёбер (замените веса всех рёбер на их логарифмы).
 - Минимальный остов является также и остовом с минимальным весом самого тяжелого ребра.
-- Остовные деревья — частный случай [матроидов](/cs/greedy/matroid).
+- Остовные деревья — частный случай [матроидов](/cs/combinatorial-optimization/matroid).
diff --git a/content/russian/cs/string-searching/manacher.md b/content/russian/cs/string-searching/manacher.md
index 8954b653..16d32ccb 100644
--- a/content/russian/cs/string-searching/manacher.md
+++ b/content/russian/cs/string-searching/manacher.md
 vector<int> pal_array(string s) {
 
 Тот же пример $s = aa\dots a$ показывает, что данная реализация работает за $O(n^2)$.
 
-Для оптимизации применим идею, знакомую из алгоритма [z-функции](string-searching): при инициализации $t_i$ будем пользоваться уже посчитанными $t$. А именно, будем поддерживать $(l, r)$ — интервал, соответствующий самому правому из найденных подпалиндромов. Тогда мы можем сказать, что часть наибольшего палиндрома с центром в $s_i$, которая лежит внутри $s_{l:r}$, имеет радиус хотя бы $\min(r-i, \; t_{l+r-i})$. Первая величина равна длине, дальше которой произошел бы выход за пределы $s_{l:r}$, а вторая — значению радиуса в позиции, зеркальной относительно центра палиндрома $s_{l:r}$.
+Для оптимизации применим идею, знакомую из алгоритма [z-функции](/cs/string-searching/z-function/): при инициализации $t_i$ будем пользоваться уже посчитанными $t$. А именно, будем поддерживать $(l, r)$ — интервал, соответствующий самому правому из найденных подпалиндромов. Тогда мы можем сказать, что часть наибольшего палиндрома с центром в $s_i$, которая лежит внутри $s_{l:r}$, имеет радиус хотя бы $\min(r-i, \; t_{l+r-i})$. Первая величина равна длине, дальше которой произошел бы выход за пределы $s_{l:r}$, а вторая — значению радиуса в позиции, зеркальной относительно центра палиндрома $s_{l:r}$.
 
 ```c++
 
diff --git a/content/russian/cs/string-structures/aho-corasick.md b/content/russian/cs/string-structures/aho-corasick.md
index 369f5171..2ca1da65 100644
--- a/content/russian/cs/string-structures/aho-corasick.md
+++ b/content/russian/cs/string-structures/aho-corasick.md
@@ -1,10 +1,11 @@
 ---
 title: Алгоритм Ахо-Корасик
 authors:
-- Сергей Слотин
+  - Сергей Слотин
 weight: 2
 prerequisites:
-- trie
+  - trie
+published: true
 ---
 
 Представим, что мы работаем журналистами в некотором авторитарном государстве, контролирующем СМИ, и в котором время от времени издаются законы, запрещающие упоминать определенные политические события или использовать определенные слова. Как эффективно реализовать подобную цензуру программно?
@@ -36,7 +37,7 @@ prerequisites:
 
 **Определение.** *Суффиксная ссылка* $l(v)$ ведёт в вершину $u \neq v$, которая соответствует наидлиннейшему принимаемому бором суффиксу $v$.
 
-**Определение.** *Автоматный переход* $\delta(v, c)$ ведёт в вершину, соответствующую минимальному принимаемому бором суффиксу строки $v + c$.
+**Определение.** *Автоматный переход* $\delta(v, c)$ ведёт в вершину, соответствующую максимальному принимаемому бором суффиксу строки $v + c$.
 
 **Наблюдение.** Если переход и так существует в боре (будем называть такой переход *прямым*), то автоматный переход будет вести туда же.
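+
+Для иллюстрации, примерный набросок ленивого вычисления суффиксных ссылок и автоматных переходов (способ хранения бора и имена полей здесь условные):
+
+```c++
+const int K = 26;
+
+struct Vertex {
+    int next[K] = {0};  // прямые переходы бора; 0 означает, что перехода нет
+    int go[K];          // автоматные переходы, вычисляем лениво
+    int p = 0, pch = 0; // родитель и символ ребра из него
+    int link = -1;      // суффиксная ссылка
+    Vertex() { fill(go, go + K, -1); }
+};
+
+vector<Vertex> trie(1); // вершина 0 -- корень
+
+int go(int v, int c);
+
+int get_link(int v) {
+    if (trie[v].link == -1)
+        trie[v].link = (v == 0 || trie[v].p == 0) ? 0 : go(get_link(trie[v].p), trie[v].pch);
+    return trie[v].link;
+}
+
+int go(int v, int c) {
+    if (trie[v].go[c] == -1) {
+        if (trie[v].next[c])
+            trie[v].go[c] = trie[v].next[c]; // прямой переход есть в боре
+        else
+            trie[v].go[c] = (v == 0) ? 0 : go(get_link(v), c); // иначе идём через суффиксную ссылку
+    }
+    return trie[v].go[c];
+}
+```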
 
diff --git a/content/russian/cs/string-structures/palindromic-tree.md b/content/russian/cs/string-structures/palindromic-tree.md
index 3d70c76b..9b57534a 100644
--- a/content/russian/cs/string-structures/palindromic-tree.md
+++ b/content/russian/cs/string-structures/palindromic-tree.md
@@ -19,7 +19,7 @@ weight: 3
 
 Будем поддерживать наибольший суффикс-палиндром. Когда мы будем дописывать очередной символ $c$, нужно найти наибольший суффикс этого палиндрома, который может быть дополнен символом $c$ — это и будет новый наидлиннейший суффикс-палиндром.
 
-Для этого поступим аналогично [алгоритму Ахо-Корасик](aho-corasick): будем поддерживать для каждого палиндрома суффиксную ссылку $l(v)$, ведущую из $v$ в её наибольший суффикс-палиндром. При добавлении очередного символа, будем подниматься по суффиксным ссылкам, пока не найдём вершину, из которой можно совершить нужный переход.
+Для этого поступим аналогично [алгоритму Ахо-Корасик](../aho-corasick): будем поддерживать для каждого палиндрома суффиксную ссылку $l(v)$, ведущую из $v$ в её наибольший суффикс-палиндром. При добавлении очередного символа, будем подниматься по суффиксным ссылкам, пока не найдём вершину, из которой можно совершить нужный переход.
 
 Если в подходящей вершине этого перехода не существовало, то нужно создать новую вершину, и для неё тоже понадобится своя суффиксная ссылка. Чтобы найти её, будем продолжать подниматься по суффиксным ссылкам предыдущего суффикс-палиндрома, пока не найдём второе такое место, которое мы можем дополнить символом $c$.
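+
+Сам подъём по суффиксным ссылкам в обоих случаях можно оформить примерно так (набросок; предполагается, что в дереве есть фиктивный корень длины $-1$, а `len[v]` и `link[v]` хранят длину палиндрома и суффиксную ссылку вершины $v$):
+
+```c++
+// ищем ближайший суффикс-палиндром вершины v, который можно дополнить символом s[i]
+int get_link(int v, int i) {
+    while (i - len[v] - 1 < 0 || s[i - len[v] - 1] != s[i])
+        v = link[v];
+    return v; // у корня длины -1 сравнение s[i] == s[i] гарантированно остановит цикл
+}
+```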
 
diff --git a/content/russian/cs/string-structures/suffix-array.md b/content/russian/cs/string-structures/suffix-array.md
index 80d2b129..a7b90768 100644
--- a/content/russian/cs/string-structures/suffix-array.md
+++ b/content/russian/cs/string-structures/suffix-array.md
@@ -22,7 +22,7 @@ weight: 100
 
 ![Сортировка всех суффиксов строки «mississippi$»](../img/sa-sort.png)
 
-**Где это может быть полезно.** Пусть вы хотите основать ещё один поисковик, и чтобы получить финансирование, вам нужно сделать хоть что-то минимально работающее — хотя бы просто научиться искать по ключевому слову документы, включающие его, а также позиции их вхождения (в 90-е это был бы уже довольно сильный MVP). Простыми алгоритмами — [полиномиальными хешами](/cs/hashing), [z- и префикс-функцией](/cs/string-searching) и даже [Ахо-Корасиком](/cs/automata/aho-corasick) — это сделать быстро нельзя, потому что на каждый раз нужно проходиться по всем данным, а суффиксными структурами — можно.
+**Где это может быть полезно.** Пусть вы хотите основать ещё один поисковик, и чтобы получить финансирование, вам нужно сделать хоть что-то минимально работающее — хотя бы просто научиться искать по ключевому слову документы, включающие его, а также позиции их вхождения (в 90-е это был бы уже довольно сильный MVP). Простыми алгоритмами — [полиномиальными хешами](/cs/hashing), [z- и префикс-функцией](/cs/string-searching) и даже [Ахо-Корасиком](../aho-corasick) — это сделать быстро нельзя, потому что на каждый раз нужно проходиться по всем данным, а суффиксными структурами — можно.
 
 В случае с суффиксным массивом можно сделать следующее: сконкатенировать все строки-документы с каким-нибудь внеалфавитным разделителем (`$`), построить по ним суффиксный массив, а дальше для каждого запроса искать бинарным поиском первый суффикс в суффиксном массиве, который меньше искомого слова, а также последний, который меньше. Все суффиксы между этими двумя будут включать искомую строку как префикс.
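+
+Примерный набросок такого бинарного поиска (сигнатура условная; суффиксы сравниваются с запросом только по первым $|q|$ символам, поэтому один запрос занимает $O(|q| \log n)$):
+
+```c++
+// возвращает полуинтервал [lo, hi) позиций суффиксного массива,
+// суффиксы которых начинаются со строки q
+pair<int, int> find_range(const string &s, const vector<int> &sa, const string &q) {
+    int lo = lower_bound(sa.begin(), sa.end(), q, [&](int i, const string &t) {
+        return s.compare(i, t.size(), t) < 0; // суффикс i меньше t по первым |t| символам
+    }) - sa.begin();
+    int hi = upper_bound(sa.begin(), sa.end(), q, [&](const string &t, int i) {
+        return s.compare(i, t.size(), t) > 0; // суффикс i больше t по первым |t| символам
+    }) - sa.begin();
+    return {lo, hi};
+}
+```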
 
@@ -132,11 +132,11 @@ vector suffix_array(vector &s) {
 
 Тогда есть мотивация посчитать массив `lcp`, в котором окажутся наибольшие общие префиксы соседних суффиксов, а после как-нибудь считать минимумы на отрезках в этом массиве (например, с помощью [разреженной таблицы](/cs/range-queries/sparse-table)).
 
-Осталось придумать способ быстро посчитать массив `lcp`. Можно воспользоваться идеей из построения суффиксного массива за $O(n \log^2 n)$: с помощью [хешей](hashing) и бинпоиска находить `lcp` для каждой пары соседей. Такой метод работает за $O(n \log n)$, но является не самым удобным и популярным.
+Осталось придумать способ быстро посчитать массив `lcp`. Можно воспользоваться идеей из построения суффиксного массива за $O(n \log^2 n)$: с помощью [хешей](/cs/hashing/polynomial/) и бинпоиска находить `lcp` для каждой пары соседей. Такой метод работает за $O(n \log n)$, но является не самым удобным и популярным.
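+
+Примерный набросок такого подсчёта (обозначения условные; предполагается, что префиксные хеши `h` и степени основания `pw` уже предподсчитаны, а `n` хранит длину строки):
+
+```c++
+typedef unsigned long long ull;
+
+const int maxn = 1e5;
+int n;                         // длина строки
+ull h[maxn + 1], pw[maxn + 1]; // префиксные хеши и степени основания
+
+ull get_hash(int l, int r) { // хеш подстроки s[l, r)
+    return h[r] - h[l] * pw[r - l];
+}
+
+int lcp(int i, int j) { // наибольший общий префикс суффиксов i и j
+    int lo = 0, hi = n - max(i, j);
+    while (lo < hi) {
+        int m = (lo + hi + 1) / 2;
+        if (get_hash(i, i + m) == get_hash(j, j + m))
+            lo = m;
+        else
+            hi = m - 1;
+    }
+    return lo;
+}
+```
+
+Массив `lcp` тогда получается вызовом этой функции для каждой пары соседних в суффиксном массиве суффиксов.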
 
 ### Алгоритм Касаи, Аримуры, Арикавы, Ли, Парка
 
-Алгоритм в реальности называется как угодно, но не исходным способом (*алгоритм Касаи*, *алгоритм пяти корейцев*, и т. д.). Используется для подсчета $lcp$ за линейное время. Автору алгоритм кажется чем-то похожим на [z-функцию](string-searching) по своей идее.
+Алгоритм в реальности называется как угодно, но не исходным способом (*алгоритм Касаи*, *алгоритм пяти корейцев*, и т. д.). Используется для подсчета $lcp$ за линейное время. Автору алгоритм кажется чем-то похожим на [z-функцию](/cs/string-searching/z-function) по своей идее.
 
 **Утверждение.** Пусть мы уже построили суфмасс и посчитали $lcp[i]$. Тогда:
 
diff --git a/content/russian/cs/tree-structures/treap.md b/content/russian/cs/tree-structures/treap.md
index 561280a5..ad11c794 100644
--- a/content/russian/cs/tree-structures/treap.md
+++ b/content/russian/cs/tree-structures/treap.md
@@ -1,19 +1,20 @@
 ---
 title: Декартово дерево
 authors:
-- Сергей Слотин
-date: 2021-08-20
-created: "2018"
+  - Сергей Слотин
+date: 2022-01-22
+created: '2018'
 prerequisites:
-- .
-- ../basic-structures/heap
-- /math/probability/expectation
+  - .
+  - ../basic-structures/heap
+  - /math/probability/expectation
 weight: 1
+published: true
 ---
 
 Рене Декарт (фр. *René Descartes*) — великий французский математик и философ XVII века.
 
-Рене Декарт не является создателем декартова дерева, но он является создателем декартовой системы координат, которую мы все знаем и любим.
+Рене Декарт не является создателем декартова дерева, однако он является создателем декартовой системы координат, которую мы все знаем и любим.
 
 Декартово дерево же определяется и строится так:
 
@@ -88,14 +89,18 @@ $$
 Теперь, чтобы найти матожидание глубины, эти вероятности надо просуммировать:
 
 $$
-E[d_i] = \sum_{j \neq i} p(j, i) = \sum_{j \neq i} \frac{1}{|i-j|+1} \leq \sum_{i=1}^n \frac{2}{n} = O(\log n)
+E[d_i] = \sum_{j \neq i} p(j, i)
+       = \sum_{j \neq i} \frac{1}{|i-j|+1}
+       = \sum_{j < i} \frac{1}{i - j + 1} + \sum_{j > i} \frac{1}{j - i + 1}
+       \leq 2 \cdot \sum_{k=2}^n \frac{1}{k}
+       = O(\log n)
 $$
 
 Перед последним переходом мы получили сумму гармонического ряда.
 
 Примечательно, что ожидаемая глубина вершин зависит от их позиции: вершина из середины должна быть примерно в два раза глубже, чем крайняя.
 
-**Упражнение.** Выведите по аналогии с этим рассуждением асимптотику [quicksort](/cs/sorting/quicksort).
+**Упражнение.** Выведите по аналогии с этим рассуждением асимптотику quicksort.
 
 ## Реализация
 
@@ -194,7 +199,7 @@ struct Node {
 Вместо того, чтобы модифицировать и `merge`, и `split` под наши хотелки, напишем вспомогательную функцию `upd`, которую будем вызывать при обновлении детей вершины:
 
 ```c++
-void sum(Node* v) { return v ? v->sum : 0; }
+int sum(Node* v) { return v ? v->sum : 0; }
 // обращаться по пустому указателю нельзя -- выдаст ошибку
 
 void upd(Node* v) { v->sum = sum(v->l) + sum(v->r) + v->val; }
diff --git a/netlify.toml b/netlify.toml
index 1b5ed16e..fb612037 100644
--- a/netlify.toml
+++ b/netlify.toml
@@ -2,7 +2,7 @@
 command = "hugo --gc --minify"
 
 [context.production.environment]
-HUGO_VERSION = "0.87.0"
+HUGO_VERSION = "0.96.0"
 HUGO_ENV = "production"
 HUGO_ENABLEGITINFO = "true"
 
@@ -10,20 +10,20 @@ HUGO_ENABLEGITINFO = "true"
 command = "hugo --gc --minify --enableGitInfo"
 
 [context.split1.environment]
-HUGO_VERSION = "0.87.0"
+HUGO_VERSION = "0.96.0"
 HUGO_ENV = "production"
 
 [context.deploy-preview]
 command = "hugo --gc --minify --buildFuture -b $DEPLOY_PRIME_URL"
 
 [context.deploy-preview.environment]
-HUGO_VERSION = "0.87.0"
+HUGO_VERSION = "0.96.0"
 
 [context.branch-deploy]
 command = "hugo --gc --minify -b $DEPLOY_PRIME_URL"
 
 [context.branch-deploy.environment]
-HUGO_VERSION = "0.87.0"
+HUGO_VERSION = "0.96.0"
 
 [context.next.environment]
 HUGO_ENABLEGITINFO = "true"
diff --git a/scripts/check-links.sh b/scripts/check-links.sh
new file mode 100644
index 00000000..9f87cefd
--- /dev/null
+++ b/scripts/check-links.sh
@@ -0,0 +1,2 @@
+# hugo serve
+wget --spider -r -nd -nv http://localhost:1313/
diff --git a/scripts/list-files.sh b/scripts/list-files.sh
new file mode 100644
index 00000000..47259b5c
--- /dev/null
+++ b/scripts/list-files.sh
@@ -0,0 +1 @@
+find ./ -type f -name "*.md" -exec wc {} +
diff --git a/themes/algorithmica/assets/dark.sass b/themes/algorithmica/assets/dark.sass
index c26997ba..b5a53b28 100644
--- a/themes/algorithmica/assets/dark.sass
+++ b/themes/algorithmica/assets/dark.sass
@@ -1,24 +1,22 @@
-$font-color: rgb(206, 177, 150)
-$background: black
-$borders: 1px solid #d4ae8d
+$font-color: #DDD
+$background: #222
+$borders: 1px solid rgb(57, 57, 57)
 
-$code-background: #222
-$code-border: 1px solid #333
-$quote-line-color: 0.25em #d4ae8d solid
+$code-background: #333
+$code-border: 1px solid #444
+$quote-line-color: 0.25em #444 solid
 
-$dimmed: #cea163
-$section-headers: #c77d0f
-$headers-color: rgb(200, 160, 130)
+$dimmed: rgb(179, 179, 179)
+$section-headers: rgb(239, 239, 239)
+$headers-color: rgb(239, 239, 239)
 $scrollbar1: #444
 $scrollbar2: #555
 $scrollbar3: #666
 
-$link-color: #ac7625
-$link-hover-color: #eb9a20
+$link-color: #80acd3
+$link-hover-color: #5490c5
 
 @import style.sass
 
 img
-  //filter: invert(100%) sepia(100%) saturate(0%) hue-rotate(288deg) brightness(102%) contrast(102%)
-  filter: invert(100%) sepia(20%) saturate(36.4%) hue-rotate(29deg) brightness(85%)
-  
\ No newline at end of file
+  filter: invert(85%) sepia(20%) saturate(100%) hue-rotate(29deg) brightness(85%)
diff --git a/themes/algorithmica/assets/style.sass b/themes/algorithmica/assets/style.sass
index 9b05bd35..00a420cf 100644
--- a/themes/algorithmica/assets/style.sass
+++ b/themes/algorithmica/assets/style.sass
@@ -53,6 +53,10 @@ $link-active-color: $link-hover-color//#faa700 !default
   font-family: "Crimson"
   src: url(fonts/crimson.ttf)
 
+//@font-face
+//  font-family: "Linux Libertine"
+//  src: url(fonts/linux-libertine.ttf)
+
 /* layout */
 html, body
   margin: 0
@@ -153,6 +157,11 @@ body
       &::before
         content: counter(chapter-counter) "." counter(section-counter) ". "
         font-weight: bold
+  
+  .draft, .draft a
+    color: $dimmed
+
+    
 
 #wrapper
   width: 100%
@@ -178,10 +187,10 @@ menu
   display: flex
   font-family: $font-headings
   
-  height: 30px
+  height: 26px
   background-color: $background
   justify-content: space-between
-  padding: 12px
+  padding: 14px
   margin: 0
   text-align: center
 
@@ -213,7 +222,37 @@ menu
     .title
       opacity: 1
       transition: opacity 0.1s
-    
+
+#search
+  display: none
+  font-family: $font-interface
+
+  input
+    width: 100%
+    padding: 6px
+
+    color: $font-color
+
+    background: $code-background
+    border: $code-border
+
+    &:focus
+      outline: 1px solid $dimmed
+
+  #search-count
+    margin-top: 8px
+    color: $dimmed
+  
+  #search-results
+    margin-top: 6px
+    border-bottom: $borders
+
+    li
+      list-style: none
+      margin: 12px 6px
+
+    p
+      margin-top: 0
 
 /*
   .github
@@ -257,7 +296,7 @@ main
   min-width: 500px
   max-width: 850px
   margin: auto
-  padding: 6px 12px
+  padding: 6px 18px
 
   // so that the footer is stuck to bottom even if the page is short:
   min-height: calc(100vh - 168px)
@@ -278,7 +317,7 @@ article
     img
      display: block
       max-width: 90%
-      max-height: 400px
+      max-height: 500px
       margin-bottom: 4px
     
     figcaption
@@ -394,7 +433,7 @@ footer
   font-family: $font-interface
 
 .katex
-  font-size: 1.15em !important
+  font-size: 1.1em !important
 
 /* headers */
 h1, h2, h3, h4, h5, h6
@@ -455,7 +494,14 @@ pre
   padding: 2px
   padding-left: 8px
   font-size: 0.85em
-  
+  text-align: left
+
+pre.center-pre
+  text-align: center
+  font-size: 1em
+  background: none
+  border: none
+
 .highlight
   margin: 0px
 
diff --git a/themes/algorithmica/i18n/en.toml b/themes/algorithmica/i18n/en.toml
index d58a7924..6fa12340 100644
--- a/themes/algorithmica/i18n/en.toml
+++ b/themes/algorithmica/i18n/en.toml
@@ -15,6 +15,15 @@ other = "updated"
 [sections]
 other = "sections"
 
+[search]
+other = "Search this book…"
+
+[searchCountPrefix]
+other = "Found"
+
+[searchCountSuffix]
+other = "pages"
+
 [prerequisites]
 other = "prerequisites"
 
@@ -22,7 +31,7 @@ other = "prerequisites"
 other = "translations"
 
 [copyright1]
-other = "Copyright 2021 Sergey Slotin"
+other = "Copyright 2021–2022 Sergey Slotin"
 
 [copyright2]
 other = " " # Content is distributed under CC BY-NC
diff --git a/themes/algorithmica/i18n/ru.toml b/themes/algorithmica/i18n/ru.toml
index a25a0c27..08d47b66 100644
--- a/themes/algorithmica/i18n/ru.toml
+++ b/themes/algorithmica/i18n/ru.toml
@@ -21,6 +21,15 @@ other = "обновлено"
 [sections]
 other = "статьи раздела"
 
+[search]
+other = "Поиск по сайту…"
+
+[searchCountPrefix]
+other = "Найдено"
+
+[searchCountSuffix]
+other = "страниц"
+
 [prerequisites]
 other = "пререквизиты"
 
@@ -28,7 +37,7 @@ other = "пререквизиты"
 other = "переводы"
 
 [copyright1]
-other = "Copyleft 2017–2021 Тинькофф Образование" # {{ .Count / . }}
+other = "Copyleft 2017–2022 Algorithmica.org" # {{ .Count / . }}
 
 [copyright2]
 other = "Материалы распространяются под CC BY-SA"
diff --git a/themes/algorithmica/layouts/_default/_markup/render-codeblock-center.html b/themes/algorithmica/layouts/_default/_markup/render-codeblock-center.html
new file mode 100644
index 00000000..d263bb5a
--- /dev/null
+++ b/themes/algorithmica/layouts/_default/_markup/render-codeblock-center.html
@@ -0,0 +1,3 @@
+
+{{.Inner}}
+
diff --git a/themes/algorithmica/layouts/_default/baseof.html b/themes/algorithmica/layouts/_default/baseof.html
index f9056521..dbe71ede 100644
--- a/themes/algorithmica/layouts/_default/baseof.html
+++ b/themes/algorithmica/layouts/_default/baseof.html
@@ -6,6 +6,7 @@
     {{- partial "buttons.html" . -}}
+    {{ partial "search.html" . }}
     {{- partial "header.html" . -}}
     {{- block "main" . }}{{- end }}
diff --git a/themes/algorithmica/layouts/_default/list.searchindex.json b/themes/algorithmica/layouts/_default/list.searchindex.json
new file mode 100644
index 00000000..6310c263
--- /dev/null
+++ b/themes/algorithmica/layouts/_default/list.searchindex.json
@@ -0,0 +1,5 @@
+{{- $.Scratch.Add "searchindex" slice -}}
+{{- range $index, $element := .Site.Pages -}}
+  {{- $.Scratch.Add "searchindex" (dict "id" $index "title" $element.Title "path" $element.RelPermalink "content" $element.Plain) -}}
+{{- end -}}
+{{- $.Scratch.Get "searchindex" | jsonify -}}
diff --git a/themes/algorithmica/layouts/partials/buttons.html b/themes/algorithmica/layouts/partials/buttons.html
index ce9d5728..265b63d9 100644
--- a/themes/algorithmica/layouts/partials/buttons.html
+++ b/themes/algorithmica/layouts/partials/buttons.html
@@ -3,16 +3,21 @@ {{ with .File }}{{ $path = .Path }}{{ end }}
 {{.Title}}
@@ -20,7 +25,9 @@
-
+
diff --git a/themes/algorithmica/layouts/partials/head.html b/themes/algorithmica/layouts/partials/head.html
index 55c6d380..c5013dba 100644
--- a/themes/algorithmica/layouts/partials/head.html
+++ b/themes/algorithmica/layouts/partials/head.html
@@ -10,6 +10,11 @@
+
+
+
+
+
 {{ $dark := resources.Get "dark.sass" | toCSS | minify | fingerprint }}
@@ -18,22 +23,101 @@
       console.log("Toggling sidebar visibility")
       var sidebar = document.getElementById('sidebar')
       var wrapper = document.getElementById('wrapper')
-      if (sidebar.classList.contains('sidebar-toggled') || window.getComputedStyle(sidebar).display == 'block') {
+      if (sidebar.classList.contains('sidebar-toggled') || window.getComputedStyle(sidebar).display == 'block') {
        sidebar.classList.toggle('sidebar-hidden')
        wrapper.classList.toggle('sidebar-hidden')
      }
      sidebar.classList.add('sidebar-toggled')
      wrapper.classList.add('sidebar-toggled')
    }
+
    function switchTheme(theme) {
      console.log("Changing theme:", theme)
      document.getElementById('theme').href = (theme == 'dark' ? "{{ $dark.RelPermalink }}" : "")
      document.getElementById('syntax-theme').href = (theme == 'dark' ? '/syntax-dark.css' : '/syntax.css')
      localStorage.setItem('theme', theme)
    }
+
+   async function toggleSearch() {
+     console.log("Toggling search")
+
+     var searchDiv = document.getElementById('search')
+     if (window.getComputedStyle(searchDiv).display == 'none') {
+       searchDiv.style.display = 'block'
+       window.scrollTo({ top: 0 });
+       document.getElementById('search-bar').focus()
+     } else {
+       searchDiv.style.display = 'none'
+     }
+
+     if (!index) {
+       console.log("Fetching index")
+       const response = await fetch('/searchindex.json')
+       const pages = await response.json()
+       index = lunr(function() {
+         this.use(lunr.multiLanguage('en', 'ru'))
+         this.field('title', {
+           boost: 5
+         })
+         this.field('content', {
+           boost: 1
+         })
+         pages.forEach(function(doc) {
+           this.add(doc)
+           articles.push(doc)
+         }, this)
+       })
+       console.log("Ready to search")
+     }
+   }
+
+   var articles = []
+   var index = undefined
+
+   function search() {
+     var query = document.getElementById('search-bar').value
+     var resultsDiv = document.getElementById('search-results')
+     var countDiv = document.getElementById('search-count')
+
+     if (query == '') {
+       resultsDiv.innerHTML = ''
+       countDiv.innerHTML = ''
+       return
+     }
+
+     var results = index.search(query)
+
+     countDiv.innerHTML = '{{ T "searchCountPrefix" }} ' + results.length + ' {{ T "searchCountSuffix" }}'
+
+     let resultList = ''
+
+     for (const n in results) {
+       const item = articles[results[n].ref]
+       resultList += '' + item.title + ''
+       const text = item.content
+
+       const contextLimit = 80
+
+       if (text.includes(query)) {
+         const start = text.indexOf(query)
+         if (start > contextLimit)
+           resultList += '…'
+         resultList += text.substring(start - contextLimit, start) +
+           '' + query + '' + text.substring(start + query.length, start + query.length + contextLimit)
+       } else {
+         resultList += text.substring(0, contextLimit * 2)
+       }
+       resultList += '…'
+     }
+
+     resultsDiv.innerHTML = resultList
+   }
+
    if (localStorage.getItem('theme') == 'dark') {
      switchTheme('dark')
    }
+
    window.addEventListener('load', function() {
      var el = document.getElementById("active-element")
      //console.log(el)
@@ -46,6 +130,7 @@
      toggleSidebar()
    }*/
    })
+
    window.addEventListener('scroll', function() {
      var menu = document.getElementById('menu')
      if (window.scrollY < 120) {
@@ -56,8 +141,10 @@
        menu.classList.add('scrolled')
      }
    })
-    // onkeypress didn't work with arrows for some reasons
+
    window.addEventListener('keydown', function(e) {
+      if (e.altKey) { return }
+      if (document.activeElement.tagName == 'INPUT') { return }
      if (e.key == 'ArrowLeft') {
        document.getElementById('prev-article').click()
      } else if (e.key == 'ArrowRight') {
diff --git a/themes/algorithmica/layouts/partials/search.html b/themes/algorithmica/layouts/partials/search.html
new file mode 100644
index 00000000..ee853dfa
--- /dev/null
+++ b/themes/algorithmica/layouts/partials/search.html
@@ -0,0 +1,6 @@
+
diff --git a/themes/algorithmica/layouts/partials/sidebar.html b/themes/algorithmica/layouts/partials/sidebar.html
index 2276957a..652a1f1b 100644
--- a/themes/algorithmica/layouts/partials/sidebar.html
+++ b/themes/algorithmica/layouts/partials/sidebar.html
@@ -24,13 +24,13 @@ {{ if isset .Params "part" }}
       {{.Params.Part}}
     {{ end }}
-      {{ .Title }}
     {{ if .IsSection }}
       {{ range .Pages }}
-        {{ .Title }}
      {{ end }}
diff --git a/themes/algorithmica/static/fonts/linux-libertine.ttf b/themes/algorithmica/static/fonts/linux-libertine.ttf
new file mode 100644
index 00000000..ab154440
Binary files /dev/null and b/themes/algorithmica/static/fonts/linux-libertine.ttf differ
diff --git a/themes/algorithmica/static/scripts/lunr.multi.min.js b/themes/algorithmica/static/scripts/lunr.multi.min.js
new file mode 100644
index 00000000..6f417304
--- /dev/null
+++ b/themes/algorithmica/static/scripts/lunr.multi.min.js
@@ -0,0 +1 @@
+[single-line minified bundle: the lunr.multiLanguage plugin, the lunr Russian stemmer, and lunr.trimmerSupport — vendored third-party code, body omitted]