diff --git a/backend-simulators/pim/pim-simulator/Cargo.lock b/backend-simulators/pim/pim-simulator/Cargo.lock index 339ce39..7828249 100644 --- a/backend-simulators/pim/pim-simulator/Cargo.lock +++ b/backend-simulators/pim/pim-simulator/Cargo.lock @@ -1030,6 +1030,15 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "libmimalloc-sys" +version = "0.1.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d1eacfa31c33ec25e873c136ba5669f00f9866d0688bea7be4d3f7e43067df6" +dependencies = [ + "cc", +] + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -1095,6 +1104,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +[[package]] +name = "mimalloc" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3627c4272df786b9260cabaa46aec1d59c93ede723d4c3ef646c503816b0640" +dependencies = [ + "libmimalloc-sys", +] + [[package]] name = "mime" version = "0.3.17" @@ -1414,6 +1432,7 @@ dependencies = [ "faer-traits", "glob", "hex", + "mimalloc", "paste", "plotly", "rayon", diff --git a/backend-simulators/pim/pim-simulator/Cargo.toml b/backend-simulators/pim/pim-simulator/Cargo.toml index 425bb31..7e8df3a 100644 --- a/backend-simulators/pim/pim-simulator/Cargo.toml +++ b/backend-simulators/pim/pim-simulator/Cargo.toml @@ -1,4 +1,3 @@ - [package] name = "pim-simulator" version = "0.1.0" @@ -34,3 +33,4 @@ plotly = {version="0.8", optional=true} rayon = "1.12.0" faer = "0.24.0" faer-traits = "0.24.0" +mimalloc = "0.1.50" diff --git a/backend-simulators/pim/pim-simulator/src/bin/pim-simulator/main.rs b/backend-simulators/pim/pim-simulator/src/bin/pim-simulator/main.rs index b520da9..ad2371c 100644 --- 
a/backend-simulators/pim/pim-simulator/src/bin/pim-simulator/main.rs +++ b/backend-simulators/pim/pim-simulator/src/bin/pim-simulator/main.rs @@ -1,3 +1,8 @@ +use mimalloc::MiMalloc; + +#[global_allocator] +static GLOBAL: MiMalloc = MiMalloc; + use anyhow::{Context, Result, bail}; use clap::Parser; use glob::glob; @@ -8,8 +13,8 @@ use pimcore::memory_manager::CoreMemory; use pimcore::tracing::TRACER; use serde_json::Value; use std::collections::HashMap; -use std::fs::{self, read_link}; -use std::io::Write; +use std::fs::{self, File, read_link}; +use std::io::{BufReader, Write}; use std::path::PathBuf; /// Program to simulate core execution configuration @@ -45,13 +50,17 @@ fn main() -> Result<()> { let args = Args::parse(); let config_json = retrive_config(&args)?; - let core_inputs = retrive_cores(&args)?; + let mut core_inputs = retrive_cores(&args)?; let memory = retrive_memory(&args)?; let global_crossbars = get_crossbars(&config_json, &args).unwrap(); let crossbars = map_crossbars_to_cores(&config_json, &args, &global_crossbars); - let mut executor = match &core_inputs { - CoreInputs::Json(core_jsons) => json_to_executor::json_to_executor(config_json, core_jsons.iter(), crossbars), - CoreInputs::Binary(core_bins) => binary_to_executor(config_json, core_bins.iter(), crossbars)?, + let mut executor = match &mut core_inputs { + CoreInputs::Json(core_jsons) => { + json_to_executor::json_to_executor(config_json, core_jsons, crossbars) + } + CoreInputs::Binary(core_bins) => { + binary_to_executor(config_json, core_bins.iter(), crossbars)? 
+ } }; set_memory(&mut executor, memory); TRACER @@ -143,8 +152,7 @@ fn get_crossbars(config: &Value, args: &Args) -> anyhow::Result Result> { } enum CoreInputs { - Json(Vec), + Json(Vec>), Binary(Vec>), } @@ -239,15 +247,13 @@ fn retrive_cores(args: &Args) -> Result { } return Ok(CoreInputs::Binary(core_bins)); } - let mut core_jsons: Vec = Vec::with_capacity(cores_override.len()); + let mut core_jsons_reader: Vec> = Vec::with_capacity(cores_override.len()); for core in cores_override { - let content = fs::read_to_string(core) - .with_context(|| format!("Failed to read core file: {:?}", cores_override))?; - let json: Value = - serde_json::from_str(&content).context("Failed to parse core json override")?; - core_jsons.push(json); + let file = File::open(core)?; + let reader = BufReader::new(file); + core_jsons_reader.push(reader); } - return Ok(CoreInputs::Json(core_jsons)); + return Ok(CoreInputs::Json(core_jsons_reader)); } if let Some(folder) = args.folder.as_ref() { @@ -275,15 +281,13 @@ fn retrive_cores(args: &Args) -> Result { bail!("No core*.pim or core*.json files found in {:?}", folder); } - let mut core_jsons: Vec = Vec::with_capacity(json_paths.len()); + let mut core_json_reader: Vec> = Vec::with_capacity(json_paths.len()); for path in json_paths { - let content = fs::read_to_string(&path) - .with_context(|| format!("Failed to read core file: {:?}", path))?; - let json: Value = serde_json::from_str(&content) - .with_context(|| format!("Failed to parse JSON in {:?}", path))?; - core_jsons.push(json); + let file = File::open(path)?; + let reader = BufReader::new(file); + core_json_reader.push(reader); } - return Ok(CoreInputs::Json(core_jsons)); + return Ok(CoreInputs::Json(core_json_reader)); } bail!("Either --core or --folder must be provided to find core definitions."); diff --git a/backend-simulators/pim/pim-simulator/src/lib/instruction_set/isa.rs b/backend-simulators/pim/pim-simulator/src/lib/instruction_set/isa.rs index 28f1269..ca84b9a 100644 --- 
a/backend-simulators/pim/pim-simulator/src/lib/instruction_set/isa.rs +++ b/backend-simulators/pim/pim-simulator/src/lib/instruction_set/isa.rs @@ -16,7 +16,7 @@ use anyhow::{Context, Result, ensure}; use rayon::prelude::*; use paste::paste; -use std::{borrow::Cow, cell::OnceCell, collections::HashMap}; +use std::{borrow::Cow, cell::OnceCell, collections::HashMap }; use std::{collections::HashSet, sync::LazyLock}; macro_rules! add_name { @@ -35,7 +35,7 @@ macro_rules! add_name_simd { }; } -static NAMES: LazyLock> = LazyLock::new(|| { +pub static NAMES: LazyLock> = LazyLock::new(|| { let mut hash = HashMap::new(); add_name!(hash, sldi); add_name!(hash, sld); @@ -81,6 +81,7 @@ pub fn functor_to_name(functor: usize) -> &'static str { /////////////////////////////////////////////////////////////// /////////////////Scalar/register Instructions////////////////// /////////////////////////////////////////////////////////////// +#[inline(never)] pub fn sldi(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_sldi(cores, data); let (core_indx, rd, imm) = data.get_core_rd_imm(); @@ -90,6 +91,7 @@ pub fn sldi(cores: &mut CPU, data: InstructionData) -> Result Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn sld(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_sld(cores, data); let (core_indx, rd, r1) = data.get_core_rd_r1(); @@ -104,6 +106,7 @@ pub fn sld(cores: &mut CPU, data: InstructionData) -> Result Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn sadd(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_sadd(cores, data); let (core_indx, rd, r1, r2) = data.get_core_rd_r1_r2(); @@ -114,6 +117,7 @@ pub fn sadd(cores: &mut CPU, data: InstructionData) -> Result Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn ssub(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_ssub(cores, data); let (core_indx, rd, r1, r2) = 
data.get_core_rd_r1_r2(); @@ -124,6 +128,7 @@ pub fn ssub(cores: &mut CPU, data: InstructionData) -> Result Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn smul(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_smul(cores, data); let (core_indx, rd, r1, r2) = data.get_core_rd_r1_r2(); @@ -134,6 +139,7 @@ pub fn smul(cores: &mut CPU, data: InstructionData) -> Result Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn saddi(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_saddi(cores, data); let (core_indx, rd, r1, imm) = data.get_core_rd_r1_imm(); @@ -143,6 +149,7 @@ pub fn saddi(cores: &mut CPU, data: InstructionData) -> Result Result { TRACER.lock().unwrap().pre_smuli(cores, data); let (core_indx, rd, r1, imm) = data.get_core_rd_r1_imm(); @@ -217,14 +224,17 @@ pub fn is_setbw(functor: InstructionType) -> bool { functor as usize == setbw as *const () as usize } +#[inline(never)] pub fn setbw(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, this instruction is resolved in the construction phase"); } +#[inline(never)] pub fn mvmul(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn mvm_impl_internal( cores: &mut CPU, data: InstructionData, @@ -309,6 +319,7 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub(super) fn mvmul_impl(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice + UpcastSlice + UpcastSlice, @@ -329,10 +340,12 @@ where } } +#[inline(never)] pub fn vvadd(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn vvadd_impl(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice, @@ -371,10 +384,12 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn 
vvsub(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn vvsub_impl(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice, @@ -416,6 +431,7 @@ pub fn vvmul(cores: &mut CPU, data: InstructionData) -> Result(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice, @@ -452,10 +468,12 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn vvdmul(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn vvdmul_impl(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice, @@ -488,10 +506,12 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn vvmax(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn vvmax_impl(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice, @@ -525,22 +545,26 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn vvsll(cores: &mut CPU, data: InstructionData) -> Result { panic!( "Shift left on floating point what does it means? who has generated this instruction???" ); } +#[inline(never)] pub fn vvsra(cores: &mut CPU, data: InstructionData) -> Result { panic!( "Shift right on floating point what does it means? who has generated this instruction???" 
); } +#[inline(never)] pub fn vavg(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn vavg_impl(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice, @@ -570,10 +594,12 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn vrelu(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn vrelu_impl(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice, @@ -600,10 +626,12 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn vtanh(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn vtanh_impl(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice, @@ -628,10 +656,12 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn vsigm(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn vsigm_impl(cores: &mut CPU, data: InstructionData) -> Result where [F]: UpcastSlice, @@ -654,10 +684,12 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn vsoftmax(cores: &mut CPU, data: InstructionData) -> Result { panic!("You are calling a placeholder, the real call is the generic version"); } +#[inline(never)] pub(super) fn vsoftmax_impl( cores: &mut CPU, data: InstructionData, @@ -696,14 +728,17 @@ where Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn vmv(cores: &mut CPU, data: InstructionData) -> Result { todo!() } +#[inline(never)] pub fn vrsu(cores: &mut CPU, data: InstructionData) -> Result { todo!() } +#[inline(never)] pub fn vrsl(cores: &mut CPU, data: InstructionData) -> Result { todo!() } @@ -711,6 +746,7 @@ pub fn 
vrsl(cores: &mut CPU, data: InstructionData) -> Result /////////////////////////////////////////////////////////////// ///Communication/synchronization Instructions///////////////// /////////////////////////////////////////////////////////////// +#[inline(never)] pub fn ld(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_ld(cores, data); let (core, rd, r1, _, imm_len, offset_select, offset_value) = @@ -727,6 +763,7 @@ pub fn ld(cores: &mut CPU, data: InstructionData) -> Result { Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn st(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_st(cores, data); let (core, rd, r1, _, imm_len, offset_select, offset_value) = @@ -743,6 +780,7 @@ pub fn st(cores: &mut CPU, data: InstructionData) -> Result { Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn lldi(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_lldi(cores, data); let (core, rd, imm) = data.get_core_rd_imm(); @@ -759,6 +797,7 @@ pub fn lldi(cores: &mut CPU, data: InstructionData) -> Result Ok(InstructionStatus::Completed) } +#[inline(never)] pub fn lmv(cores: &mut CPU, data: InstructionData) -> Result { TRACER.lock().unwrap().pre_lmv(cores, data); let (core, rd, r1, _, imm_len, offset_select, offset_value) = @@ -775,18 +814,32 @@ pub fn lmv(cores: &mut CPU, data: InstructionData) -> Result Ok(InstructionStatus::Completed) } +#[inline(never)] +pub fn isa_send(functor : usize) -> bool{ + (send as *const () as usize) == functor +} + +#[inline(never)] pub fn send(cores: &mut CPU, data: InstructionData) -> Result { Ok(InstructionStatus::Sending(data)) } +#[inline(never)] +pub fn isa_recv(functor : usize) -> bool{ + (recv as *const () as usize) == functor +} + +#[inline(never)] pub fn recv(cores: &mut CPU, data: InstructionData) -> Result { Ok(InstructionStatus::Reciving(data)) } +#[inline(never)] pub fn wait(cores: &mut CPU, data: InstructionData) -> Result { 
Ok(InstructionStatus::Waiting(data)) } +#[inline(never)] pub fn sync(cores: &mut CPU, data: InstructionData) -> Result { Ok(InstructionStatus::Sync(data)) } diff --git a/backend-simulators/pim/pim-simulator/src/lib/instruction_set/mod.rs b/backend-simulators/pim/pim-simulator/src/lib/instruction_set/mod.rs index af6da35..81cf77a 100644 --- a/backend-simulators/pim/pim-simulator/src/lib/instruction_set/mod.rs +++ b/backend-simulators/pim/pim-simulator/src/lib/instruction_set/mod.rs @@ -14,7 +14,7 @@ pub mod helper; #[derive(Clone, Copy, Debug)] pub struct Instruction { pub data: InstructionData, - functor: InstructionType, + pub functor: InstructionType, } #[derive(Debug, Clone, Copy, Default)] diff --git a/backend-simulators/pim/pim-simulator/src/lib/json_to_instruction/json_to_executor.rs b/backend-simulators/pim/pim-simulator/src/lib/json_to_instruction/json_to_executor.rs index f6caad6..9d8dbff 100644 --- a/backend-simulators/pim/pim-simulator/src/lib/json_to_instruction/json_to_executor.rs +++ b/backend-simulators/pim/pim-simulator/src/lib/json_to_instruction/json_to_executor.rs @@ -1,4 +1,5 @@ use serde_json::Value; +use std::{fs::File, io::BufReader}; use crate::{ CoreInstructionsBuilder, Executable, @@ -7,21 +8,22 @@ use crate::{ json_to_instruction::json_isa, }; - pub fn json_to_executor<'a, 'b>( config: Value, - cores: impl Iterator, - crossbars : Vec> + cores: &'b mut Vec>, + crossbars: Vec>, ) -> Executable<'a> { let core_cnt = config.get("core_cnt").unwrap().as_i64().unwrap() as i32; let cpu = CPU::new(core_cnt, crossbars); let mut core_insts_builder = CoreInstructionsBuilder::new(core_cnt as usize); - for (external_core_indx, json_core) in cores.enumerate() { + for (external_core_indx, json_core_reader) in cores.iter_mut().enumerate() { let core_indx = external_core_indx as i32 + 1; let mut insts_builder = InstructionsBuilder::new(); let mut inst_data_builder = InstructionDataBuilder::new(); inst_data_builder.set_core_indx(core_indx).fix_core_indx(); + 
let json_core: Value = serde_json::from_reader(json_core_reader) + .unwrap_or_else(|err| panic!("failed to parse core{}: {}", external_core_indx, err)); let json_core_insts = json_core .as_array() .unwrap_or_else(|| panic!("core{} has not a list of instruction", external_core_indx)); diff --git a/backend-simulators/pim/pim-simulator/src/lib/pimcore.rs b/backend-simulators/pim/pim-simulator/src/lib/pimcore.rs index b716d6e..89a9866 100644 --- a/backend-simulators/pim/pim-simulator/src/lib/pimcore.rs +++ b/backend-simulators/pim/pim-simulator/src/lib/pimcore.rs @@ -1,16 +1,22 @@ #![allow(unused)] -use std::time::{Duration, SystemTime}; +use std::{ + collections::{HashMap, HashSet}, + time::{Duration, SystemTime}, +}; use crate::{ cpu::CPU, - instruction_set::{Instruction, InstructionStatus, Instructions, isa::functor_to_name}, + instruction_set::{ + Instruction, InstructionStatus, Instructions, + isa::{NAMES, functor_to_name, isa_recv, isa_send}, + }, memory_manager::type_traits::TryToUsize, send_recv::{SendRecv, handle_send_recv}, tracing::TRACER, }; -pub mod cpu; pub mod binary_to_instruction; +pub mod cpu; pub mod instruction_set; pub mod json_to_instruction; pub mod memory_manager; @@ -145,8 +151,9 @@ impl<'a> Executable<'a> { cpu_progressed = 0; *program_counter += 1; } - if (now.elapsed().unwrap() > Duration::from_secs(1)) { - print_status(&cores_instructions); + if (now.elapsed().unwrap() > Duration::from_secs(5)) { + print_status(cores_instructions); + check_cycle(cpu, cores_instructions, send_recv); now = SystemTime::now(); } } @@ -171,7 +178,7 @@ impl<'a> Executable<'a> { } print_status(cores_instructions); - #[cfg(feature = "profile_time")] + #[cfg(feature = "profile_time")] TRACER.lock().unwrap().report(); } @@ -194,6 +201,101 @@ impl<'a> Executable<'a> { } } +fn check_cycle(cpu: &mut CPU, cores_instructions: &[CoreInstructions], send_recv: &mut SendRecv) { + #[derive(Debug, PartialEq, Eq)] + enum CoreState { + SendingTo(i32), + ReceivingFrom(i32), + 
+ Working, + Halted, + } + + let mut states = HashMap::new(); + + for core_inst in cores_instructions.iter() { + if core_inst.program_counter >= core_inst.instructions.len() { + continue; + } + + let Instruction { data, functor } = core_inst.instructions[core_inst.program_counter]; + let functor_address = functor as usize; + + let (this_core, target_core) = data.get_core_immcore(); + + if isa_recv(functor_address) { + states.insert(this_core, CoreState::ReceivingFrom(target_core)); + } else if isa_send(functor_address) { + states.insert(this_core, CoreState::SendingTo(target_core)); + } else { + states.insert(this_core, CoreState::Working); + } + } + + let mut wait_for = HashMap::new(); + + for (&core_id, state) in states.iter() { + match state { + CoreState::SendingTo(target_core) => { + let target_state = states.get(target_core).unwrap_or(&CoreState::Halted); + if target_state != &CoreState::ReceivingFrom(core_id) { + wait_for.insert(core_id, *target_core); + } + } + CoreState::ReceivingFrom(target_core) => { + let target_state = states.get(target_core).unwrap_or(&CoreState::Halted); + if target_state != &CoreState::SendingTo(core_id) { + wait_for.insert(core_id, *target_core); + } + } + CoreState::Working | CoreState::Halted => { + } + } + } + + let mut visited = HashSet::new(); + + for &start_core in wait_for.keys() { + if visited.contains(&start_core) { + continue; + } + + let mut path = Vec::new(); + let mut current_core = start_core; + let mut in_path = HashSet::new(); + + while let Some(&waiting_for) = wait_for.get(&current_core) { + path.push(current_core); + in_path.insert(current_core); + visited.insert(current_core); + + // Found a closed loop! 
+ if in_path.contains(&waiting_for) { + let cycle_start = path.iter().position(|&c| c == waiting_for).unwrap(); + let cycle = &path[cycle_start..]; + + let cycle_str = cycle + .iter() + .map(|c| c.to_string()) + .collect::<Vec<_>>() + .join(" -> "); + + let cycle_msg = format!("{} -> {}", cycle_str, waiting_for); + + println!("Fatal: Deadlock cycle detected: {}", cycle_msg); + // bail!("Deadlock detected: {}", cycle_msg); + break; // Stop tracing + } + + // Hit a known branch that didn't result in a cycle + if visited.contains(&waiting_for) { + break; + } + + current_core = waiting_for; + } + } +} + fn handle_wait_sync<'a, 'b, 'c>( cpu: &'b mut CPU<'a>, core_instructions: &'c mut [CoreInstructions],