From 97973c31e076d19148b00902767adfe1deb45b3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Hugo=20M=C3=A5rdbrink?=
Date: Sat, 30 Mar 2024 22:49:45 +0100
Subject: [PATCH] Add multicore support for hardware configuration

---
 Design_and_analysis.md | 22 +++++++++++-----
 riscv_hw.py            | 58 ++++++++++++++++++++++--------------------
 2 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/Design_and_analysis.md b/Design_and_analysis.md
index c1c5637..c33787c 100644
--- a/Design_and_analysis.md
+++ b/Design_and_analysis.md
@@ -14,7 +14,7 @@ Thus, the algorithm is used in spacecraft to decrease the amount of image data t
Since the environment in space is limited, the design needs to focus on an energy efficient design using a small hardware area.
This alters the focus of the codesign to prefer energy efficiency over throughput or execution time.
However, the aspect of fast execution times is still highly relevant and a good balance between the two needs to be explored.
-
+Notably, current RISC-V space processors have no vector processing units, which makes vectorisation an interesting aspect to explore.

## Method
### Development and evaluation
@@ -24,6 +24,7 @@ For parallelisation, the (OpenMP library)[https://www.openmp.org/] will be used.
To test and evaluate the software implementation, it will run in the gem5 simulator. The hardware configuration is also done in configuration files for gem5.
The mock data for the images will be generated in C with nonsensical values. This does not matter since different values will not affect the run time.
When measuring the performance the sequential time of generating mock data and freeing the memory will be deducted for a true performance reflection.
+For the parts where the problem size increases, performance will be measured in cycles per DCT block.

### Building

@@ -38,7 +39,7 @@ The following flags will be used based on what functionality is needed:
- `-lm` for math library
- `-libomp` for OpenMP library
- `-O[level]` for different optimisation levels
-- `-march=rv64imafcv` for the RISC-V ISA
+- `-march=rv64imadcv` for the RISC-V ISA
- `-mabi=lp64d` for the RISC-V ABI

### Simulating
@@ -51,15 +52,27 @@ The python script for this project is tailored for this project specifically, th
- `--l2` for the L2 cache size
- `--vlen` for the vector length
- `--elen` for the element length
+- `--cores` for the number of cores

To run the simulation and output the result, the following command is used:
```bash
-../gem5/build/RISCV/gem5.opt -d stats/ ./riscv_hw.py --l1i 16kB --l1d 64kB --l2 256kB --vlen 256 --elen 64
+../gem5/build/RISCV/gem5.opt -d stats/ ./riscv_hw.py --l1i 16kB --l1d 64kB --l2 256kB --vlen 256 --elen 64 --cores 1
```

## Implementation

+### Initial hardware configuration
+For the initial and naive software implementation, some hardware configurations are set. These are:
+- L1 instruction cache size: 16kB
+- L1 data cache size: 64kB
+- L2 cache size: 256kB
+- Vector length: 256
+- Element length: 64
+- Number of threads: 1
+- L1 cache associativity: 2
+- L2 cache associativity: 8
+
### Constants and definitions
Throughout the code, several constants and definitions are defined for ease to try different configurations.
These are defined in the following way:
- `DCT_SIZE` is the size of the DCT block
@@ -75,9 +88,6 @@ This will be done by allocating DCT-blocks heap memory and filling them with data.
It's important to actually generate all the data and not reuse the same matrices to get realistic cache hits and misses.
The memory allocation is done in the following way: -### Initial hardware configuration - - ```c element_t ***mock_matrices = (element_t ***) malloc(TOTAL_DCT_BLOCKS * sizeof(element_t**)); for (int i = 0; i < TOTAL_DCT_BLOCKS; i++) { diff --git a/riscv_hw.py b/riscv_hw.py index 4aa90ec..4132315 100644 --- a/riscv_hw.py +++ b/riscv_hw.py @@ -1,16 +1,17 @@ import m5 -from m5.objects import System, SrcClockDomain, VoltageDomain, Root -from m5.objects import RiscvO3CPU, Cache, AddrRange, SEWorkload, Process -from m5.objects import MemCtrl, DDR3_1600_8x8, SystemXBar, L2XBar, RiscvISA +from m5.objects import System, SrcClockDomain, VoltageDomain, Root, \ + RiscvO3CPU, Cache, AddrRange, SEWorkload, Process, MemCtrl, \ + DDR3_1600_8x8, SystemXBar, L2XBar, RiscvISA class RiscvHWConfig: - def __init__(self, l1i, l1d, l2, vlen, elen): + def __init__(self, l1i, l1d, l2, vlen, elen, cores): self.l1i = l1i self.l1d = l1d self.l2 = l2 self.vlen = vlen self.elen = elen + self.cores = cores def get_config(): @@ -21,8 +22,10 @@ def get_config(): parser.add_argument('--l2', type=str, default='256kB') parser.add_argument('--vlen', type=int, default=256) parser.add_argument('--elen', type=int, default=64) + parser.add_argument('--cores', type=int, default=1) args = parser.parse_args() - return RiscvHWConfig(args.l1i, args.l1d, args.l2, args.vlen, args.elen) + return RiscvHWConfig(args.l1i, args.l1d, args.l2, + args.vlen, args.elen, args.cores) class L1Cache(Cache): @@ -92,6 +95,24 @@ class L2Cache(Cache): self.mem_side = bus.cpu_side_ports +def createCPU(l2bus, config): + cpu = RiscvO3CPU() + cpu.isa = RiscvISA(enable_rvv=True, vlen=config.vlen, elen=config.elen) + + cpu.icache = L1ICache(config) + cpu.dcache = L1DCache(config) + + cpu.icache.connectCPU(cpu) + cpu.dcache.connectCPU(cpu) + + cpu.icache.connectBus(l2bus) + cpu.dcache.connectBus(l2bus) + + cpu.createInterruptController() + + return cpu + + config = get_config() print(f"l1i size: {config.l1i}") @@ -99,6 +120,7 @@ print(f"l1d size: {config.l1d}") print(f"l2 size: {config.l2}") print(f"vlen size: {config.vlen} bits") print(f"elen size: {config.elen} bits") +print(f"cores: {config.cores}") print("\n") assert config.vlen >= 2 * config.elen, \ @@ -121,35 +143,16 @@ system.clk_domain.voltage_domain = VoltageDomain() system.mem_mode = 'timing' system.mem_ranges = [AddrRange('512MB')] -system.cpu = RiscvO3CPU() -system.cpu.isa = RiscvISA(enable_rvv=True, vlen=config.vlen, elen=config.elen) - -# Create the L1 caches -system.cpu.icache = L1ICache(config) -system.cpu.dcache = L1DCache(config) - -# Connect the caches to the CPU -system.cpu.icache.connectCPU(system.cpu) -system.cpu.dcache.connectCPU(system.cpu) - -# Connect the CPU to the L2 bus system.l2bus = L2XBar() -# Connect the L1 caches to the L2 bus -system.cpu.icache.connectBus(system.l2bus) -system.cpu.dcache.connectBus(system.l2bus) +system.cpu = [createCPU(system.l2bus, config) for _ in range(config.cores)] -# Connect the L2 cache to the CPU side bus system.l2cache = L2Cache(config) system.l2cache.connectCPUSideBus(system.l2bus) -# Connect the L2 cache to the memory bus system.membus = SystemXBar() system.l2cache.connectMemSideBus(system.membus) -# Connect the CPU to the memory bus -system.cpu.createInterruptController() - system.mem_ctrl = MemCtrl() system.mem_ctrl.dram = DDR3_1600_8x8() system.mem_ctrl.dram.range = system.mem_ranges[0] @@ -161,8 +164,9 @@ system.workload = SEWorkload.init_compatible(binary) process = Process() process.cmd = [binary] -system.cpu.workload = process 
-system.cpu.createThreads() +for cpu in system.cpu: + cpu.workload = process + cpu.createThreads() # Run SE mode root = Root(full_system=False, system=system)
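
Note (not part of the diff above): the patch gives every simulated core the same SE-mode `Process`, but the benchmark's OpenMP runtime decides its own thread count. Below is a minimal sketch of how the end of `riscv_hw.py` could tie that thread count to `--cores` through the process environment; the `OMP_NUM_THREADS` handling is an assumption about the OpenMP runtime inside the simulated binary, and `Process.env` should be checked against the gem5 version in use.

```python
# Sketch (assumption, not part of this patch): expose the simulated core count
# to the benchmark's OpenMP runtime via the SE-mode process environment.
process = Process()
process.cmd = [binary]
process.env = [f"OMP_NUM_THREADS={config.cores}"]

# Same workload on every core, as in the patch above.
for cpu in system.cpu:
    cpu.workload = process
    cpu.createThreads()
```

Running with `--cores 4` would then let each of the four simulated cores execute one OpenMP thread, assuming the benchmark parallelises over DCT blocks.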
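
Note (not part of the diff above): Design_and_analysis.md states that performance will be measured in cycles per DCT block as the problem size grows. One way to get that number is to post-process the `stats.txt` that gem5 writes to the `-d stats/` directory. A rough sketch follows, assuming the per-core cycle counters appear as `numCycles` lines in the stats file and that the block count matches the `TOTAL_DCT_BLOCKS` constant in the C benchmark; both are assumptions to verify against the actual run.

```python
# Rough post-processing sketch (assumptions: stats/stats.txt exists and
# contains one numCycles line per core; the block count matches the benchmark).
import re

TOTAL_DCT_BLOCKS = 4096  # hypothetical value, must match the C benchmark

with open("stats/stats.txt") as f:
    cycles = [int(m.group(1))
              for line in f
              if (m := re.search(r"numCycles\s+(\d+)", line))]

# With several cores the run is bounded by the busiest core's cycle count.
print(f"cycles per DCT block: {max(cycles) / TOTAL_DCT_BLOCKS:.1f}")
```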