Skip to content

Commit

Permalink
bugfix: l1 cache is 32kb for each core
Browse files Browse the repository at this point in the history
  • Loading branch information
elinx committed Sep 30, 2021
1 parent 3d4ad11 commit 735c873
Show file tree
Hide file tree
Showing 4 changed files with 228 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ target_link_libraries(openblas lib_openblas)
find_package(Halide)
add_executable(halide_reference halide_reference.cpp)
target_link_libraries(halide_reference Halide::Halide "-lopenblas")

add_executable(cache-latency-checker cache_latency_checker.cpp)
23 changes: 16 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,15 @@ Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtr
- registers: 16 x 256bit
- cache line size: 64bytes(8 FP64)

- Detailed Memory information

| Level | Type | Size | Ways | Sets | Latency |
| :---: | :---------: | :----: | :--: | :---: | :-----: |
| L1 | Data | 32 KB | 8 | 64 | |
| L1 | Instruction | 32 KB | 8 | 64 | |
| L2 | Data | 256 KB | 4 | 1024 | |
| L3 | Data | 9 MB | 12 | 12288 | |

- Instruction Timing

| Intrinsics | Instruction | Latency | Throughput |
Expand Down Expand Up @@ -95,16 +104,16 @@ for my hardware in single thread
| K | 640 | | |
| MR | 4 | | |
| NR | 8 | | |
| KC | 320 | | |
| MC | 640 | | |
| NC | 40 | | |
| KC | 256 | | |
| MC | 64 | | |
| NC | 640 | | |
| A | (M, K) | 3276800 | 3.125 MB |
| B | (K, N) | 3276800 | 3.125 MB |
| C | (M, N) | 3276800 | 3.125 MB |
| Ac | (MC, KC) | 1638400 | 1.5625 MB |
| Bc | (KC, NC) | 102400 | 100 KB |
| Ac-Slice | (MR, KC) | 10240 | 10 KB |
| Bc-Slice | (KC, NR) | 20480 | 20 KB |
| Ac | (MC, KC) | 131072 | 128 KB |
| Bc | (KC, NC) | 1310720 | 1.25 MB |
| Ac-Slice | (MR, KC) | 8192 | 8 KB |
| Bc-Slice | (KC, NR) | 16384 | 16 KB |
| Cc | (MR, NR) | 256 | 256 B |

# Benchmarks
Expand Down
206 changes: 206 additions & 0 deletions cache_latency_checker.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
// The following code is adapted from: https://stackoverflow.com/a/21463541/1691873
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * Enumerate the caches reported by the x86 CPUID cache query (EAX = 4),
 * print a description of each one to stdout, and collect the total size in
 * bytes of every data or unified cache.
 *
 * data_caches: output array; receives one entry per data/unified cache, in
 *              the order CPUID reports them. At most 32 sub-leaves are
 *              queried, so the array must have room for 32 entries.
 *
 * Returns the number of entries written to data_caches.
 */
int i386_cpuid_caches(size_t* data_caches) {
    int num_data_caches = 0;

    for (int i = 0; i < 32; i++) {
        // The four legacy registers CPUID reads and writes.
        uint32_t eax = 4;            // command code: cache query
        uint32_t ebx;
        uint32_t ecx = (uint32_t)i;  // cache id (sub-leaf)
        uint32_t edx;

        // volatile: the instruction must be issued once per iteration exactly
        // as written, never merged or dropped by the optimizer.
        __asm__ __volatile__("cpuid"
                             : "+a"(eax), "=b"(ebx), "+c"(ecx), "=d"(edx));
        (void)edx;  // written by CPUID but not used here

        // taken from http://download.intel.com/products/processor/manual/325462.pdf Vol. 2A 3-149
        const unsigned int cache_type = eax & 0x1F;
        if (cache_type == 0) {
            break;  // end of valid cache identifiers
        }

        const char* cache_type_string;
        switch (cache_type) {
            case 1:
                cache_type_string = "Data Cache";
                break;
            case 2:
                cache_type_string = "Instruction Cache";
                break;
            case 3:
                cache_type_string = "Unified Cache";
                break;
            default:
                cache_type_string = "Unknown Type Cache";
                break;
        }

        // Each field is decoded with an absolute shift so no line depends on
        // an earlier line having already modified the register copy.
        const unsigned int cache_level = (eax >> 5) & 0x7;
        const unsigned int cache_is_self_initializing = (eax >> 8) & 0x1;  // no SW init needed
        const unsigned int cache_is_fully_associative = (eax >> 9) & 0x1;

        // taken from http://download.intel.com/products/processor/manual/325462.pdf 3-166 Vol. 2A
        // ebx packs three "value minus one" fields of 12, 10 and 10 bits (low to high).
        const unsigned int cache_sets = ecx + 1;
        const unsigned int cache_coherency_line_size = (ebx & 0xFFF) + 1;
        const unsigned int cache_physical_line_partitions = ((ebx >> 12) & 0x3FF) + 1;
        const unsigned int cache_ways_of_associativity = ((ebx >> 22) & 0x3FF) + 1;

        // Total cache size is the product; widen first so the multiplication
        // is carried out in size_t rather than in 32-bit unsigned int.
        const size_t cache_total_size = (size_t)cache_ways_of_associativity *
                                        cache_physical_line_partitions *
                                        cache_coherency_line_size * cache_sets;

        if (cache_type == 1 || cache_type == 3) {
            data_caches[num_data_caches++] = cache_total_size;
        }

        printf(
            "Cache ID %d:\n"
            "- Level: %u\n"
            "- Type: %s\n"
            "- Sets: %u\n"
            "- System Coherency Line Size: %u bytes\n"
            "- Physical Line partitions: %u\n"
            "- Ways of associativity: %u\n"
            "- Total Size: %zu bytes (%zu kb)\n"
            "- Is fully associative: %s\n"
            "- Is Self Initializing: %s\n"
            "\n",
            i, cache_level, cache_type_string, cache_sets, cache_coherency_line_size,
            cache_physical_line_partitions, cache_ways_of_associativity, cache_total_size,
            cache_total_size >> 10, cache_is_fully_associative ? "true" : "false",
            cache_is_self_initializing ? "true" : "false");
    }

    return num_data_caches;
}

/*
 * Time single-byte loads at pseudo-random offsets inside a freshly mapped
 * buffer and record each measurement in a histogram of cycle counts.
 *
 * attempts:         number of timed loads to perform.
 * lower_cache_size: size in bytes of the buffer that is mapped and probed.
 * latencies:        histogram indexed by measured TSC delta; one bucket is
 *                   incremented per attempt. The histogram is NOT cleared
 *                   here, so counts accumulate on top of what is already
 *                   stored.
 * max_latency:      number of buckets in latencies; any measurement that does
 *                   not fit is counted in the last bucket.
 *
 * Returns 0 on success, or -1 if latencies is NULL or max_latency is 0.
 * Aborts the process (after perror) if the buffer cannot be mapped.
 */
int test_cache(size_t attempts, size_t lower_cache_size, size_t* latencies, size_t max_latency) {
    if (latencies == NULL || max_latency == 0) {
        return -1;  // there is no bucket to record into
    }

    // Anonymous private mapping. No file descriptor is opened: nothing was
    // ever read from one, so opening it only leaked a descriptor per call.
    char* random_data = (char*)mmap(NULL, lower_cache_size, PROT_READ | PROT_WRITE,
                                    MAP_PRIVATE | MAP_ANON  // | MAP_POPULATE
                                    ,
                                    -1, 0);
    if (random_data == MAP_FAILED) {
        perror("mmap");
        abort();
    }

    // Write one byte per page so every page is backed before timing starts.
    const long reported_page_size = sysconf(_SC_PAGESIZE);
    const size_t page_size = reported_page_size > 0 ? (size_t)reported_page_size : 4096;
    for (size_t offset = 0; offset < lower_cache_size; offset += page_size) {
        random_data[offset] = 1;
    }

    size_t random_offset = 0;
    while (attempts--) {
        random_offset = (random_offset + (size_t)rand()) % lower_cache_size;

        // use processor clock timer for exact measurement
        uint32_t cycles_used, high_delta, start_high, start_low;
        // - volatile + "memory": the sequence must run once per iteration and
        //   must not be reordered against surrounding memory accesses.
        // - Every output is early-clobber (&) because all of them are written
        //   before the "m" input is read; without it the compiler may keep the
        //   load address in a register the asm has already overwritten.
        __asm__ __volatile__(
            "mfence\n\t"         // memory fence
            "rdtsc\n\t"          // get cpu cycle count
            "mov %%edx, %2\n\t"  // save start, high word
            "mov %%eax, %3\n\t"  // save start, low word
            "mfence\n\t"         // memory fence
            "mov %4, %%al\n\t"   // load data
            "mfence\n\t"
            "rdtsc\n\t"
            "sub %3, %%eax\n\t"  // subtract low words first ...
            "sbb %2, %%edx"      // ... then high words, with borrow
            : "=&a"(cycles_used), "=&d"(high_delta), "=&r"(start_high), "=&r"(start_low)
            : "m"(random_data[random_offset])
            : "memory", "cc");
        (void)high_delta;  // only the low 32 bits of the delta are recorded

        if ((size_t)cycles_used < max_latency) {
            latencies[cycles_used]++;
        } else {
            latencies[max_latency - 1]++;
        }
    }

    munmap(random_data, lower_cache_size);

    return 0;
}

/*
 * Return the index of the first histogram bucket at which the running total
 * reaches `fraction` of the sum of all buckets. Falls back to the last bucket
 * if no bucket reaches it; returns 0 for an empty histogram.
 */
static size_t histogram_quantile_bucket(const size_t* histogram, size_t bucket_count,
                                        double fraction) {
    if (bucket_count == 0) {
        return 0;
    }

    size_t total = 0;
    for (size_t j = 0; j < bucket_count; j++) {
        total += histogram[j];
    }

    size_t running = 0;
    for (size_t j = 0; j < bucket_count; j++) {
        running += histogram[j];
        if (running >= total * fraction) {
            return j;
        }
    }
    return bucket_count - 1;
}

/*
 * Entry point: list the CPU caches, measure the overhead of the timing
 * sequence itself, then report for each data/unified cache the 75th
 * percentile load latency with that overhead subtracted.
 */
int main() {
    size_t cache_sizes[32];
    const int num_data_caches = i386_cpuid_caches(cache_sizes);

    size_t latencies[0x400];
    const size_t bucket_count = sizeof(latencies) / sizeof(*latencies);
    const size_t attempts = 1000000;

    // measure how much overhead we have for counting cycles
    memset(latencies, 0, sizeof(latencies));
    for (size_t i = 0; i < attempts; i++) {
        uint32_t cycles_used, high_delta, start_high, start_low;
        // Same instruction sequence as the timed load in test_cache, minus the
        // load. volatile is required: this asm has no inputs, so without it
        // the compiler may hoist it out of the loop and run it only once.
        __asm__ __volatile__(
            "mfence\n\t"         // memory fence
            "rdtsc\n\t"          // get cpu cycle count
            "mov %%edx, %2\n\t"  // save start, high word
            "mov %%eax, %3\n\t"  // save start, low word
            "mfence\n\t"         // memory fence
            "mfence\n\t"
            "rdtsc\n\t"
            "sub %3, %%eax\n\t"  // subtract low words first ...
            "sbb %2, %%edx"      // ... then high words, with borrow
            : "=&a"(cycles_used), "=&d"(high_delta), "=&r"(start_high), "=&r"(start_low)
            :
            : "memory", "cc");
        (void)high_delta;  // only the low 32 bits of the delta are recorded

        if ((size_t)cycles_used < bucket_count) {
            latencies[cycles_used]++;
        } else {
            latencies[bucket_count - 1]++;
        }
    }

    const size_t empty_cycles = histogram_quantile_bucket(latencies, bucket_count, .75);
    fprintf(stderr, "Empty counting takes %zu cycles\n", empty_cycles);

    for (int i = 0; i < num_data_caches; i++) {
        // test_cache adds to the histogram it is given, so clear it first;
        // otherwise each percentile would also count the overhead samples and
        // the samples of every previously tested cache.
        memset(latencies, 0, sizeof(latencies));

        // The probed buffer is four times the size of the cache under test.
        test_cache(attempts, cache_sizes[i] * 4, latencies, bucket_count);

        const size_t bucket = histogram_quantile_bucket(latencies, bucket_count, .75);
        // Signed difference: a measurement below the overhead must print as a
        // small negative number, not wrap around to a huge unsigned value.
        const long long net_cycles = (long long)bucket - (long long)empty_cycles;
        fprintf(stderr, "Cache ID %d has latency %lld cycles\n", i, net_cycles);
    }

    return 0;
}
8 changes: 4 additions & 4 deletions manual_optimize_dgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
#include "halide_benchmark.h"
#include "halide_macros.h"

constexpr uint32_t l1_cache_size = 192 * 1024;
constexpr uint32_t l2_cache_size = 1536 * 1024;
constexpr uint32_t l1_cache_size = 32 * 1024;
constexpr uint32_t l2_cache_size = 256 * 1024;
constexpr uint32_t l3_cache_size = 9 * 1024 * 1024;

enum class MicroKernelType {
Expand Down Expand Up @@ -799,8 +799,8 @@ void manual_dgemm(const double *A, const double *B, double *C, const uint32_t M,
constexpr uint32_t TILE_W = 8;
constexpr uint32_t TILE_K = 320;

constexpr uint32_t m_outer_step = TILE_H * 160;
constexpr uint32_t n_outer_step = TILE_W * 40;
constexpr uint32_t m_outer_step = TILE_H * 16;
constexpr uint32_t n_outer_step = TILE_W * 80;
constexpr uint32_t k_outer_step = TILE_K;

const uint32_t m_outer_bound = (M + m_outer_step - 1) / m_outer_step * m_outer_step;
Expand Down

0 comments on commit 735c873

Please sign in to comment.