From a64185dcd1fc16c8904fe978482447100b60c665 Mon Sep 17 00:00:00 2001 From: angelayi Date: Tue, 27 Jan 2026 22:15:23 -0800 Subject: [PATCH] Add vllm viewer --- src/lib.rs | 71 +- src/parsers.rs | 8 +- src/vllm/mod.rs | 11 + src/vllm/parsers.rs | 287 + src/vllm/templates.rs | 236 + src/vllm/types.rs | 88 + tests/inputs/vllm_sample.log | 14680 +++++++++++++++++++++++++++++++++ tests/integration_test.rs | 39 + 8 files changed, 15413 insertions(+), 7 deletions(-) create mode 100644 src/vllm/mod.rs create mode 100644 src/vllm/parsers.rs create mode 100644 src/vllm/templates.rs create mode 100644 src/vllm/types.rs create mode 100644 tests/inputs/vllm_sample.log diff --git a/src/lib.rs b/src/lib.rs index 13a15f7..509a735 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,6 +9,7 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use regex::Regex; use serde_json::Value; use std::cell::RefCell; +use std::fmt::Write as FmtWrite; use std::fs::{self, File}; use std::io::{self, BufRead}; use std::path::{Path, PathBuf}; @@ -23,6 +24,7 @@ use crate::types::*; pub mod parsers; mod templates; mod types; +pub mod vllm; pub use types::{ ArtifactFlags, CollectiveSchedule, CollectivesParityReport, Diagnostics, DivergenceFlags, @@ -121,6 +123,7 @@ fn add_file_output( output: &mut ParseOutput, compile_directory: &mut Vec, output_count: &mut i32, + vllm_state: &vllm::VllmState, ) { let is_stack_traces = is_stack_traces_file(&filename); let maybe_content = if is_stack_traces { @@ -130,6 +133,7 @@ fn add_file_output( }; output.push((filename.clone(), content)); let filename_str = filename.to_string_lossy().to_string(); + let suffix = if filename_str.contains("cache_miss") { "❌".to_string() } else if filename_str.contains("cache_hit") { @@ -139,6 +143,10 @@ fn add_file_output( } else { "".to_string() }; + + // Track artifact for vLLM summary + vllm_state.add_artifact(&filename, suffix.clone()); + let readable_url = if let Some(c) = maybe_content { Some(add_stack_traces_html(&filename, &c, output, output_count)) } else { @@ -214,6 +222,7 @@ fn run_parser<'t>( compile_directory: &mut Vec, multi: &MultiProgress, stats: &mut Stats, + vllm_state: &vllm::VllmState, ) -> ParserResult { let mut payload_filename = ParserResult::NoPayload; if let Some(md) = parser.get_metadata(&e) { @@ -224,10 +233,24 @@ fn run_parser<'t>( match parser_result { ParserOutput::File(raw_filename, out) => { let filename = add_unique_suffix(raw_filename, *output_count); - add_file_output(filename, out, output, compile_directory, output_count); + add_file_output( + filename, + out, + output, + compile_directory, + output_count, + vllm_state, + ); } ParserOutput::GlobalFile(filename, out) => { - add_file_output(filename, out, output, compile_directory, output_count); + add_file_output( + filename, + out, + output, + compile_directory, + output_count, + vllm_state, + ); } ParserOutput::PayloadFile(raw_filename) => { let filename = add_unique_suffix(raw_filename, *output_count); @@ -240,6 +263,7 @@ fn run_parser<'t>( output, compile_directory, output_count, + vllm_state, ); } ParserOutput::PayloadReformatFile(raw_filename, formatter) => { @@ -255,6 +279,7 @@ fn run_parser<'t>( output, compile_directory, output_count, + vllm_state, ); } Err(err) => { @@ -340,6 +365,7 @@ fn handle_guard( tt: &TinyTemplate, sym_expr_info_index: &RefCell, export_failures: &mut Vec, + vllm_state: &vllm::VllmState, ) { let sym_expr_info_index_borrowed = sym_expr_info_index.borrow(); let parser: Box = @@ -357,6 +383,7 @@ fn handle_guard( compile_directory, multi, 
stats, + vllm_state, ); let filename = format!( @@ -473,6 +500,16 @@ pub fn parse_path(path: &PathBuf, config: &ParseConfig) -> anyhow::Result anyhow::Result = FxHashSet::default(); @@ -520,7 +558,10 @@ pub fn parse_path(path: &PathBuf, config: &ParseConfig) -> anyhow::Result> = default_parsers.iter().collect(); + all_parsers.extend(vllm_parsers.iter()); let mut chromium_events: Vec = Vec::new(); all_parsers.extend(config.custom_parsers.iter()); @@ -757,6 +798,7 @@ pub fn parse_path(path: &PathBuf, config: &ParseConfig) -> anyhow::Result anyhow::Result anyhow::Result{cid} ", @@ -875,6 +919,7 @@ pub fn parse_path(path: &PathBuf, config: &ParseConfig) -> anyhow::Result anyhow::Result anyhow::Result anyhow::Result) -> PathBuf { +pub fn build_file_path(filename: &str, lineno: usize, compile_id: &Option) -> PathBuf { let compile_id_dir: PathBuf = compile_id .as_ref() .map_or(format!("unknown_{lineno}"), |cid| cid.as_directory_name()) @@ -151,6 +151,12 @@ impl StructuredLogParser for GraphDumpParser { "graph_dump" // ToDO: more specific? } fn get_metadata<'e>(&self, e: &'e Envelope) -> Option> { + if let Some(graph_dump) = &e.graph_dump { + if graph_dump.name.starts_with("vllm_") { + // Skip vLLM-specific graph dumps (handled by parsers under src/vllm) + return None; + } + } e.graph_dump.as_ref().map(|m| Metadata::GraphDump(m)) } fn parse<'e>( diff --git a/src/vllm/mod.rs b/src/vllm/mod.rs new file mode 100644 index 0000000..595f965 --- /dev/null +++ b/src/vllm/mod.rs @@ -0,0 +1,11 @@ +//! vLLM-specific parsing and visualization for tlparse. +//! +//! This module provides parsers and templates for vLLM's structured logs, +//! including piecewise compilation, subgraph tracking, and cudagraph captures. + +pub mod parsers; +pub mod templates; +pub mod types; + +pub use parsers::{generate_vllm_summary, vllm_parsers_with_state, VllmState}; +pub use types::VllmSummaryContext; diff --git a/src/vllm/parsers.rs b/src/vllm/parsers.rs new file mode 100644 index 0000000..e06e7f4 --- /dev/null +++ b/src/vllm/parsers.rs @@ -0,0 +1,287 @@ +use crate::parsers::{build_file_path, Metadata, ParserOutput, ParserResults, StructuredLogParser}; +use crate::templates::TEMPLATE_QUERY_PARAM_SCRIPT; +use crate::types::{CompileId, Envelope}; + +use super::types::{ + ArtifactInfo, VllmCompilationConfig, VllmCompileRangeGroup, VllmSubgraphInfo, + VllmSubgraphWithArtifacts, VllmSummaryContext, +}; + +use std::cell::RefCell; +use std::rc::Rc; +use tinytemplate::TinyTemplate; + +#[derive(Debug, Default)] +pub struct VllmState { + pub config: RefCell>, + pub piecewise_graph_file: RefCell>, + pub subgraphs: RefCell>, + pub pre_subgraph_artifacts: RefCell>, + pub has_vllm_artifacts: RefCell, +} + +impl VllmState { + pub fn new() -> Rc { + Rc::new(Self::default()) + } + + pub fn has_artifacts(&self) -> bool { + *self.has_vllm_artifacts.borrow() + } + + // Add artifact to current subgraph, or pre_subgraph_artifacts if no subgraph yet + pub fn add_artifact(&self, filename: &std::path::Path, suffix: String) { + let url = filename.to_string_lossy().to_string(); + let name = filename + .file_stem() + .and_then(|s| s.to_str()) + .map(|s| s.to_string()) + .unwrap_or_else(|| url.clone()); + + // Track piecewise split graph file for linking in summary + if name.starts_with("vllm_piecewise_split_graph") { + *self.piecewise_graph_file.borrow_mut() = Some(url.clone()); + } + + let artifact = ArtifactInfo { name, url, suffix }; + let mut subgraphs = self.subgraphs.borrow_mut(); + if let Some(last) = subgraphs.last_mut() { + 
last.artifacts.push(artifact); + } else { + self.pre_subgraph_artifacts.borrow_mut().push(artifact); + } + } + + // Group subgraphs by compile range/size for hierarchical display + pub fn build_compile_range_groups(&self) -> Vec { + use indexmap::IndexMap; + + let subgraphs = self.subgraphs.borrow(); + let mut groups: IndexMap> = IndexMap::new(); + + for subgraph in subgraphs.iter() { + let size_or_range = subgraph.size_or_range(); + let artifact_count = subgraph.artifacts.len(); + groups + .entry(size_or_range) + .or_default() + .push(VllmSubgraphWithArtifacts { + submod_name: subgraph.display_submod_name(), + artifacts: subgraph.artifacts.clone(), + artifact_count, + }); + } + + groups + .into_iter() + .map(|(size_or_range, submods)| VllmCompileRangeGroup { + size_or_range, + submod_count: submods.len(), + submods, + }) + .collect() + } + + // Get dynamo artifacts from pre_subgraph_artifacts + pub fn build_dynamo_artifacts(&self) -> Vec { + let dynamo_names = [ + "dynamo_side_effects", + "dynamo_output_graph", + "dynamo_cpp_guards_str", + "compilation_metrics", + ]; + self.pre_subgraph_artifacts + .borrow() + .iter() + .filter(|a| dynamo_names.iter().any(|name| a.name.starts_with(name))) + .cloned() + .collect() + } +} + +// Parses vllm_compilation_config artifacts. +// Stores config in state for display, outputs formatted JSON file. +pub struct VllmCompilationConfigParser { + state: Rc, +} + +impl VllmCompilationConfigParser { + pub fn new(state: Rc) -> Self { + Self { state } + } +} + +impl StructuredLogParser for VllmCompilationConfigParser { + fn name(&self) -> &'static str { + "vllm_compilation_config" + } + + fn get_metadata<'e>(&self, e: &'e Envelope) -> Option> { + if let Some(artifact) = &e.artifact { + if artifact.name == "vllm_compilation_config" { + return Some(Metadata::Artifact(artifact)); + } + } + None + } + + fn parse<'e>( + &self, + lineno: usize, + _metadata: Metadata<'e>, + _rank: Option, + compile_id: &Option, + payload: &str, + ) -> anyhow::Result { + if let Ok(config) = serde_json::from_str::(payload) { + *self.state.config.borrow_mut() = Some(config); + *self.state.has_vllm_artifacts.borrow_mut() = true; + } + + let f = build_file_path("vllm_compilation_config.json", lineno, compile_id); + Ok(vec![ParserOutput::PayloadReformatFile(f, |payload| { + let value: serde_json::Value = serde_json::from_str(payload)?; + Ok(serde_json::to_string_pretty(&value)?) + })]) + } +} + +// Parses vllm_piecewise_compile_start artifacts and vllm_subgraph_*/vllm_submod_* graph dumps. +// On compile_start: pushes new VllmSubgraphInfo to state.subgraphs (subsequent artifacts attach here). +// On graph_dump: adds artifact to current subgraph and outputs the graph file. 
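
The parser below deserializes the `vllm_piecewise_compile_start` payload straight into `VllmSubgraphInfo` (defined in `src/vllm/types.rs` later in this patch). As a minimal sketch of that payload shape, assuming the crate is consumed as the `tlparse` library with `serde_json` available on the consumer side; the concrete values are invented, and the keys follow the serde attributes on the struct:

```rust
use tlparse::vllm::types::VllmSubgraphInfo;

fn main() {
    // Hypothetical `vllm_piecewise_compile_start` payload; values are invented,
    // but the keys mirror the serde attributes on `VllmSubgraphInfo`
    // (`piecewise_index`, `is_cudagraph_capture_size`, ...).
    let payload = r#"{
        "piecewise_index": 0,
        "submod_name": "submod_0",
        "compile_range_start": 1,
        "compile_range_end": 8192,
        "is_single_size": false,
        "is_cudagraph_capture_size": false
    }"#;

    let info: VllmSubgraphInfo = serde_json::from_str(payload).expect("valid payload");
    assert_eq!(info.display_submod_name(), "submod_0");
    // A non-single-size subgraph is reported as a compile range.
    assert_eq!(info.size_or_range(), "range [1, 8192]");
}
```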
+pub struct VllmPiecewiseCompileParser { + state: Rc, +} + +impl VllmPiecewiseCompileParser { + pub fn new(state: Rc) -> Self { + Self { state } + } +} + +impl StructuredLogParser for VllmPiecewiseCompileParser { + fn name(&self) -> &'static str { + "vllm_piecewise_compile" + } + + fn get_metadata<'e>(&self, e: &'e Envelope) -> Option> { + if let Some(artifact) = &e.artifact { + if artifact.name == "vllm_piecewise_compile_start" { + return Some(Metadata::Artifact(artifact)); + } + } + if let Some(graph_dump) = &e.graph_dump { + if graph_dump.name.starts_with("vllm_subgraph_") + || graph_dump.name.starts_with("vllm_submod_") + { + return Some(Metadata::GraphDump(graph_dump)); + } + } + None + } + + fn parse<'e>( + &self, + lineno: usize, + metadata: Metadata<'e>, + _rank: Option, + compile_id: &Option, + payload: &str, + ) -> anyhow::Result { + *self.state.has_vllm_artifacts.borrow_mut() = true; + + match metadata { + Metadata::Artifact(_artifact) => { + if let Ok(subgraph) = serde_json::from_str::(payload) { + self.state.subgraphs.borrow_mut().push(subgraph); + } + Ok(Vec::new()) + } + Metadata::GraphDump(graph_dump) => { + let name = &graph_dump.name; + let filename = format!("{}.txt", name); + let f = build_file_path(&filename, lineno, compile_id); + // add_file_output will call add_artifact for us + Ok(vec![ParserOutput::PayloadFile(f)]) + } + _ => Ok(Vec::new()), + } + } +} + +// Parses vllm_piecewise_split_graph graph dumps. +// Stores path in state for linking in summary, outputs the graph file. +pub struct VllmPiecewiseSplitGraphParser { + state: Rc, +} + +impl VllmPiecewiseSplitGraphParser { + pub fn new(state: Rc) -> Self { + Self { state } + } +} + +impl StructuredLogParser for VllmPiecewiseSplitGraphParser { + fn name(&self) -> &'static str { + "vllm_piecewise_split_graph" + } + + fn get_metadata<'e>(&self, e: &'e Envelope) -> Option> { + if let Some(graph_dump) = &e.graph_dump { + if graph_dump.name == "vllm_piecewise_split_graph" { + return Some(Metadata::GraphDump(graph_dump)); + } + } + None + } + + fn parse<'e>( + &self, + lineno: usize, + _metadata: Metadata<'e>, + _rank: Option, + compile_id: &Option, + _payload: &str, + ) -> anyhow::Result { + let filename = "vllm_piecewise_split_graph.txt"; + let f = build_file_path(filename, lineno, compile_id); + *self.state.has_vllm_artifacts.borrow_mut() = true; + Ok(vec![ParserOutput::PayloadFile(f)]) + } +} + +pub fn vllm_parsers_with_state(state: Rc) -> Vec> { + vec![ + Box::new(VllmCompilationConfigParser::new(state.clone())), + Box::new(VllmPiecewiseSplitGraphParser::new(state.clone())), + Box::new(VllmPiecewiseCompileParser::new(state.clone())), + ] +} + +pub fn generate_vllm_summary( + state: &VllmState, + tt: &TinyTemplate, + custom_header_html: &str, +) -> anyhow::Result { + let config = state.config.borrow().clone().unwrap_or_default(); + let dynamo_artifacts = state.build_dynamo_artifacts(); + let has_dynamo_artifacts = !dynamo_artifacts.is_empty(); + let piecewise_graph_file = state.piecewise_graph_file.borrow().clone(); + let has_piecewise = piecewise_graph_file.is_some(); + let compile_range_groups = state.build_compile_range_groups(); + + let context = VllmSummaryContext { + css: super::templates::VLLM_CSS.to_string(), + qps: TEMPLATE_QUERY_PARAM_SCRIPT.to_string(), + custom_header_html: custom_header_html.to_string(), + has_config: state.config.borrow().is_some(), + config, + dynamo_artifacts, + has_dynamo_artifacts, + piecewise_graph_file, + has_piecewise, + compile_range_groups, + }; + + 
Ok(tt.render("vllm_summary.html", &context)?) +} diff --git a/src/vllm/templates.rs b/src/vllm/templates.rs new file mode 100644 index 0000000..d398b6e --- /dev/null +++ b/src/vllm/templates.rs @@ -0,0 +1,236 @@ +//! HTML templates for vLLM visualization. + +/// CSS for vLLM summary page +pub const VLLM_CSS: &str = r#" +body { + font-family: Arial, sans-serif; + margin: 20px; + background: #f5f5f5; +} +h1 { + color: #333; + border-bottom: 2px solid #4a90d9; + padding-bottom: 10px; +} +h2 { + color: #4a90d9; + margin-top: 30px; +} +h3 { + color: #666; + margin-top: 20px; +} +.config-table { + background: white; + border-collapse: collapse; + margin: 10px 0; + box-shadow: 0 1px 3px rgba(0,0,0,0.1); +} +.config-table td, .config-table th { + padding: 8px 16px; + border: 1px solid #ddd; + text-align: left; +} +.config-table tr:nth-child(even) { + background: #f9f9f9; +} +.compile-range-group { + margin: 20px 0; + padding: 15px; + border-radius: 8px; + background: white; + border: 1px solid #ddd; +} +.compile-range-group h3 { + margin: 0; + padding-bottom: 0; +} +.submods-container { + margin-left: 30px; + margin-top: 15px; + padding-left: 15px; + border-left: 2px solid rgba(0,0,0,0.1); +} +.submods-container > summary { + cursor: pointer; + font-weight: 500; + color: #555; + padding: 5px 0; +} +.submods-container > summary:hover { + color: #4a90d9; +} +.submods-container[open] > summary { + margin-bottom: 10px; + border-bottom: 1px solid rgba(0,0,0,0.1); + padding-bottom: 10px; +} +.subgraph { + background: rgba(255,255,255,0.7); + padding: 12px 12px 12px 20px; + margin: 10px 0 10px 25px; + border-radius: 5px; + border: 1px solid rgba(0,0,0,0.1); +} +.subgraph h4 { + margin: 0 0 8px 0; + color: #333; + font-size: 0.95em; +} +.subgraph ul { + margin: 5px 0; + padding-left: 20px; +} +.subgraph a { + color: #4a90d9; + text-decoration: none; +} +.subgraph a:hover { + text-decoration: underline; +} +.artifact-section { + margin-top: 10px; + padding: 10px; + background: rgba(0, 0, 0, 0.03); + border-radius: 4px; +} +.artifact-section summary { + cursor: pointer; + font-weight: 500; + color: #666; +} +.artifact-section summary:hover { + color: #4a90d9; +} +.artifact-list { + margin: 10px 0 0 0; + padding-left: 20px; + list-style-type: disc; +} +.artifact-list li { + margin: 4px 0; +} +.artifact-list a { + color: #4a90d9; + text-decoration: none; +} +.artifact-list a:hover { + text-decoration: underline; +} +.summary-box { + background: white; + padding: 15px; + margin: 10px 0; + border-radius: 5px; + box-shadow: 0 1px 3px rgba(0,0,0,0.1); +} +.summary-box a { + color: #4a90d9; + text-decoration: none; +} +.summary-box a:hover { + text-decoration: underline; +} +"#; + +pub const VLLM_SUMMARY_TEMPLATE: &str = r#" + + + + vLLM Compilation Summary + + + +{custom_header_html | format_unescaped} +
+<div class="summary-box">
+  <p>This is the vLLM compilation view. <a href="index.html">View original tlparse output →</a></p>
+</div>
+
+<h1>vLLM Compilation Summary</h1>
+
+{{ if has_config }}
+<h2>Compilation Configuration</h2>
+
+<h3>Core Settings</h3>
+<table class="config-table">
+  <tr><td>Model</td><td>{config.model}</td></tr>
+  <tr><td>Mode</td><td>{config.mode}</td></tr>
+  <tr><td>Backend</td><td>{config.backend}</td></tr>
+  <tr><td>Prefix</td><td>{config.prefix}</td></tr>
+  <tr><td>Custom Ops</td><td>{config.custom_ops}</td></tr>
+  <tr><td>Splitting Ops</td><td>{config.splitting_ops}</td></tr>
+</table>
+
+<h3>Compile Settings</h3>
+<table class="config-table">
+  <tr><td>CUDAGraph Mode</td><td>{config.cudagraph_mode}</td></tr>
+  <tr><td>Use Inductor Graph Partition</td><td>{config.use_inductor_graph_partition}</td></tr>
+  <tr><td>Compile Sizes</td><td>{config.compile_sizes}</td></tr>
+  <tr><td>Compile Ranges Split Points</td><td>{config.compile_ranges_split_points}</td></tr>
+  <tr><td>Inductor Passes</td><td>{config.inductor_passes}</td></tr>
+  <tr><td>Enabled Passes</td><td>{config.enabled_passes}</td></tr>
+  <tr><td>Dynamic Shapes Type</td><td>{config.dynamic_shapes_type}</td></tr>
+  <tr><td>Dynamic Shapes Evaluate Guards</td><td>{config.dynamic_shapes_evaluate_guards}</td></tr>
+</table>
+{{ endif }}
+
+<div class="summary-box">
+  <p>PT2 generates Chromium Trace Events in JSON on specific events during compilation.
+  You can download and view them in a tool like Perfetto.</p>
+</div>
+
+{{ if has_dynamo_artifacts }}
+<h2>Dynamo Compilation</h2>
+<div class="summary-box">
+  <ul class="artifact-list">
+    {{ for artifact in dynamo_artifacts }}
+    <li><a href="{artifact.url}">{artifact.name}</a> {artifact.suffix}</li>
+    {{ endfor }}
+  </ul>
+</div>
+{{ endif }}
+
+{{ if has_piecewise }}
+<h2>Piecewise Split Graph</h2>
+<div class="summary-box">
+  <a href="{piecewise_graph_file}">{piecewise_graph_file}</a>
+</div>
+{{ endif }}
+
+<h2>Inductor Compilation</h2>
+
+{{ for group in compile_range_groups }}
+<div class="compile-range-group">
+  <h3>{group.size_or_range}</h3>
+  <details class="submods-container">
+    <summary>Subgraphs ({group.submod_count})</summary>
+    {{ for subgraph in group.submods }}
+    <div class="subgraph">
+      <h4>{subgraph.submod_name}</h4>
+      {{ if subgraph.artifacts }}
+      <details class="artifact-section">
+        <summary>Artifacts ({subgraph.artifact_count} files)</summary>
+        <ul class="artifact-list">
+          {{ for artifact in subgraph.artifacts }}
+          <li><a href="{artifact.url}">{artifact.name}</a> {artifact.suffix}</li>
+          {{ endfor }}
+        </ul>
+      </details>
+      {{ endif }}
+    </div>
+    {{ endfor }}
+  </details>
+</div>
+ {{ endfor }} +{qps | format_unescaped} + + +"#; diff --git a/src/vllm/types.rs b/src/vllm/types.rs new file mode 100644 index 0000000..85902c0 --- /dev/null +++ b/src/vllm/types.rs @@ -0,0 +1,88 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Deserialize, Serialize, Clone, Default)] +pub struct VllmCompilationConfig { + pub model: Option, + pub prefix: Option, + pub mode: Option, + pub backend: Option, + pub custom_ops: Option, + pub splitting_ops: Option, + pub cudagraph_mode: Option, + pub compile_sizes: Option, + pub compile_ranges_split_points: Option, + pub use_inductor_graph_partition: Option, + pub inductor_passes: Option, + pub enabled_passes: Option, + pub dynamic_shapes_type: Option, + pub dynamic_shapes_evaluate_guards: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct VllmSubgraphInfo { + #[serde(rename = "piecewise_index")] + pub index: i32, + #[serde(default)] + pub submod_name: Option, + pub compile_range_start: i64, + pub compile_range_end: i64, + pub is_single_size: bool, + #[serde(rename = "is_cudagraph_capture_size")] + pub is_cudagraph_size: bool, + #[serde(skip)] + pub artifacts: Vec, +} + +impl VllmSubgraphInfo { + pub fn size_or_range(&self) -> String { + if self.is_single_size { + format!("size {}", self.compile_range_start) + } else { + format!( + "range [{}, {}]", + self.compile_range_start, self.compile_range_end + ) + } + } + + pub fn display_submod_name(&self) -> String { + self.submod_name + .clone() + .unwrap_or_else(|| format!("subgraph_{}", self.index)) + } +} + +#[derive(Debug, Serialize)] +pub struct VllmSummaryContext { + pub css: String, + pub qps: String, + pub custom_header_html: String, + pub config: VllmCompilationConfig, + pub has_config: bool, + pub dynamo_artifacts: Vec, + pub has_dynamo_artifacts: bool, + pub piecewise_graph_file: Option, + pub has_piecewise: bool, + pub compile_range_groups: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub struct VllmSubgraphWithArtifacts { + pub submod_name: String, + pub artifacts: Vec, + pub artifact_count: usize, +} + +#[derive(Debug, Clone, Serialize)] +pub struct VllmCompileRangeGroup { + pub size_or_range: String, + pub submod_count: usize, + pub submods: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub struct ArtifactInfo { + pub name: String, + pub url: String, + pub suffix: String, +} diff --git a/tests/inputs/vllm_sample.log b/tests/inputs/vllm_sample.log new file mode 100644 index 0000000..f0ef719 --- /dev/null +++ b/tests/inputs/vllm_sample.log @@ -0,0 +1,14680 @@ +V0127 17:17:45.175000 1175001 site-packages/torch/_dynamo/output_graph.py:2190] {"dynamo_output_graph": {"sizes": {"l_input_ids_": ["s72"], "l_self_modules_embed_tokens_parameters_weight_": [64128, 4096], "l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_": [131072, 128], "l_positions_": ["s72"], "l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_": [4096], 
"l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_": [4096], 
"l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_": [4096], 
"l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_": [4096], 
"l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_": [4096], 
"l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_": [4096], 
"l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_": [3072, 4096], "l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_": [4096, 2048], "l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_": [4096], "l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_": [14336, 4096], "l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_": [4096, 7168], "l_self_modules_norm_parameters_weight_": [4096], "ge": ["s72"], "lt": ["s72"], "org_vocab_mask": ["s72"], "ge_1": ["s72"], "lt_1": ["s72"], "added_vocab_mask": ["s72"], "mul": ["s72"], "mul_1": ["s72"], "valid_offset": ["s72"], "vocab_mask": ["s72"], "sub": ["s72"], "input_": ["s72"], "input_mask": ["s72"], "long": ["s72"], "output_parallel": ["s72", 4096], "unsqueeze": ["s72", 1], "masked_fill_": ["s72", 4096], "output": ["s72", 4096], "_get_data_attr": [4096], "x": ["s72", 4096], "pow_1": ["s72", 4096], "variance": ["s72", 1], "add_1": ["s72", 1], "rsqrt": ["s72", 1], "x_1": ["s72", 4096], "x_2": ["s72", 4096], "x_3": ["s72", 4096], "output_parallel_1": ["s72", 3072], "q": ["s72", 2048], "k": ["s72", 512], "v": ["s72", 512], "positions": ["s72"], "cos_sin": ["s72", 128], "cos": ["s72", 64], "sin": ["s72", 64], "query": ["s72", 16, 128], "query_rot": ["s72", 16, 128], "query_pass": ["s72", 16, 0], "unsqueeze_1": ["s72", 1, 64], "cos_1": ["s72", 1, 64], "unsqueeze_2": ["s72", 1, 64], "sin_1": ["s72", 1, 64], "x1": ["s72", 16, 64], "x2": ["s72", 16, 64], "mul_5": ["s72", 16, 64], "mul_6": ["s72", 16, 64], "o1": ["s72", 16, 64], "mul_7": ["s72", 16, 64], "mul_8": ["s72", 16, 64], "o2": ["s72", 16, 64], "output_1": ["s72", 16, 128], "cat_1": ["s72", 16, 128], "query_1": ["s72", 2048], "key": ["s72", 4, 128], "key_rot": ["s72", 4, 128], "key_pass": ["s72", 4, 0], "unsqueeze_3": ["s72", 1, 64], "cos_2": ["s72", 1, 64], "unsqueeze_4": ["s72", 1, 64], "sin_2": ["s72", 1, 64], "x1_1": ["s72", 4, 64], "x2_1": ["s72", 4, 64], "mul_9": ["s72", 4, 64], "mul_10": ["s72", 4, 64], "o1_1": ["s72", 4, 64], "mul_11": ["s72", 4, 64], "mul_12": ["s72", 4, 64], "o2_1": ["s72", 4, 64], "output_2": ["s72", 4, 128], "cat_3": ["s72", 4, 128], "key_1": ["s72", 512], "output_3": ["s72", 2048], "query_2": ["s72", 16, 128], "output_4": ["s72", 16, 128], "key_2": ["s72", 4, 128], "value": ["s72", 4, 128], "attn_output": ["s72", 2048], "output_parallel_2": ["s72", 4096], "output_5": ["s72", 4096], "_get_data_attr_1": [4096], "x_4": ["s72", 4096], "x_5": ["s72", 4096], "residual": ["s72", 4096], "pow_2": ["s72", 4096], "variance_1": ["s72", 1], "add_5": ["s72", 1], "rsqrt_1": ["s72", 1], "x_6": ["s72", 4096], "x_7": ["s72", 4096], "x_8": ["s72", 4096], "output_parallel_3": ["s72", 14336], "getitem_26": ["s72", 7168], "silu": ["s72", 7168], "getitem_27": ["s72", 7168], "x_9": ["s72", 7168], "output_parallel_4": ["s72", 4096], "output_6": ["s72", 4096], "_get_data_attr_2": [4096], "x_10": ["s72", 4096], "x_11": ["s72", 4096], "residual_1": ["s72", 4096], "pow_3": ["s72", 4096], "variance_2": ["s72", 1], "add_7": ["s72", 1], "rsqrt_2": ["s72", 1], "x_12": ["s72", 4096], "x_13": ["s72", 4096], "x_14": ["s72", 4096], "output_parallel_5": ["s72", 3072], "q_1": ["s72", 2048], "k_1": ["s72", 512], "v_1": ["s72", 512], "positions_1": ["s72"], "cos_sin_1": ["s72", 128], "cos_3": ["s72", 64], "sin_3": ["s72", 64], "query_3": ["s72", 16, 128], "query_rot_1": ["s72", 16, 128], "query_pass_1": ["s72", 16, 0], "unsqueeze_5": ["s72", 1, 64], 
"cos_4": ["s72", 1, 64], "unsqueeze_6": ["s72", 1, 64], "sin_4": ["s72", 1, 64], "x1_2": ["s72", 16, 64], "x2_2": ["s72", 16, 64], "mul_18": ["s72", 16, 64], "mul_19": ["s72", 16, 64], "o1_2": ["s72", 16, 64], "mul_20": ["s72", 16, 64], "mul_21": ["s72", 16, 64], "o2_2": ["s72", 16, 64], "output_7": ["s72", 16, 128], "cat_5": ["s72", 16, 128], "query_4": ["s72", 2048], "key_3": ["s72", 4, 128], "key_rot_1": ["s72", 4, 128], "key_pass_1": ["s72", 4, 0], "unsqueeze_7": ["s72", 1, 64], "cos_5": ["s72", 1, 64], "unsqueeze_8": ["s72", 1, 64], "sin_5": ["s72", 1, 64], "x1_3": ["s72", 4, 64], "x2_3": ["s72", 4, 64], "mul_22": ["s72", 4, 64], "mul_23": ["s72", 4, 64], "o1_3": ["s72", 4, 64], "mul_24": ["s72", 4, 64], "mul_25": ["s72", 4, 64], "o2_3": ["s72", 4, 64], "output_8": ["s72", 4, 128], "cat_7": ["s72", 4, 128], "key_4": ["s72", 512], "output_9": ["s72", 2048], "query_5": ["s72", 16, 128], "output_10": ["s72", 16, 128], "key_5": ["s72", 4, 128], "value_1": ["s72", 4, 128], "attn_output_1": ["s72", 2048], "output_parallel_6": ["s72", 4096], "output_11": ["s72", 4096], "_get_data_attr_3": [4096], "x_15": ["s72", 4096], "x_16": ["s72", 4096], "residual_2": ["s72", 4096], "pow_4": ["s72", 4096], "variance_3": ["s72", 1], "add_11": ["s72", 1], "rsqrt_3": ["s72", 1], "x_17": ["s72", 4096], "x_18": ["s72", 4096], "x_19": ["s72", 4096], "output_parallel_7": ["s72", 14336], "getitem_54": ["s72", 7168], "silu_1": ["s72", 7168], "getitem_55": ["s72", 7168], "x_20": ["s72", 7168], "output_parallel_8": ["s72", 4096], "output_12": ["s72", 4096], "_get_data_attr_4": [4096], "x_21": ["s72", 4096], "x_22": ["s72", 4096], "residual_3": ["s72", 4096], "pow_5": ["s72", 4096], "variance_4": ["s72", 1], "add_13": ["s72", 1], "rsqrt_4": ["s72", 1], "x_23": ["s72", 4096], "x_24": ["s72", 4096], "x_25": ["s72", 4096], "output_parallel_9": ["s72", 3072], "q_2": ["s72", 2048], "k_2": ["s72", 512], "v_2": ["s72", 512], "positions_2": ["s72"], "cos_sin_2": ["s72", 128], "cos_6": ["s72", 64], "sin_6": ["s72", 64], "query_6": ["s72", 16, 128], "query_rot_2": ["s72", 16, 128], "query_pass_2": ["s72", 16, 0], "unsqueeze_9": ["s72", 1, 64], "cos_7": ["s72", 1, 64], "unsqueeze_10": ["s72", 1, 64], "sin_7": ["s72", 1, 64], "x1_4": ["s72", 16, 64], "x2_4": ["s72", 16, 64], "mul_31": ["s72", 16, 64], "mul_32": ["s72", 16, 64], "o1_4": ["s72", 16, 64], "mul_33": ["s72", 16, 64], "mul_34": ["s72", 16, 64], "o2_4": ["s72", 16, 64], "output_13": ["s72", 16, 128], "cat_9": ["s72", 16, 128], "query_7": ["s72", 2048], "key_6": ["s72", 4, 128], "key_rot_2": ["s72", 4, 128], "key_pass_2": ["s72", 4, 0], "unsqueeze_11": ["s72", 1, 64], "cos_8": ["s72", 1, 64], "unsqueeze_12": ["s72", 1, 64], "sin_8": ["s72", 1, 64], "x1_5": ["s72", 4, 64], "x2_5": ["s72", 4, 64], "mul_35": ["s72", 4, 64], "mul_36": ["s72", 4, 64], "o1_5": ["s72", 4, 64], "mul_37": ["s72", 4, 64], "mul_38": ["s72", 4, 64], "o2_5": ["s72", 4, 64], "output_14": ["s72", 4, 128], "cat_11": ["s72", 4, 128], "key_7": ["s72", 512], "output_15": ["s72", 2048], "query_8": ["s72", 16, 128], "output_16": ["s72", 16, 128], "key_8": ["s72", 4, 128], "value_2": ["s72", 4, 128], "attn_output_2": ["s72", 2048], "output_parallel_10": ["s72", 4096], "output_17": ["s72", 4096], "_get_data_attr_5": [4096], "x_26": ["s72", 4096], "x_27": ["s72", 4096], "residual_4": ["s72", 4096], "pow_6": ["s72", 4096], "variance_5": ["s72", 1], "add_17": ["s72", 1], "rsqrt_5": ["s72", 1], "x_28": ["s72", 4096], "x_29": ["s72", 4096], "x_30": ["s72", 4096], "output_parallel_11": ["s72", 14336], 
"getitem_82": ["s72", 7168], "silu_2": ["s72", 7168], "getitem_83": ["s72", 7168], "x_31": ["s72", 7168], "output_parallel_12": ["s72", 4096], "output_18": ["s72", 4096], "_get_data_attr_6": [4096], "x_32": ["s72", 4096], "x_33": ["s72", 4096], "residual_5": ["s72", 4096], "pow_7": ["s72", 4096], "variance_6": ["s72", 1], "add_19": ["s72", 1], "rsqrt_6": ["s72", 1], "x_34": ["s72", 4096], "x_35": ["s72", 4096], "x_36": ["s72", 4096], "output_parallel_13": ["s72", 3072], "q_3": ["s72", 2048], "k_3": ["s72", 512], "v_3": ["s72", 512], "positions_3": ["s72"], "cos_sin_3": ["s72", 128], "cos_9": ["s72", 64], "sin_9": ["s72", 64], "query_9": ["s72", 16, 128], "query_rot_3": ["s72", 16, 128], "query_pass_3": ["s72", 16, 0], "unsqueeze_13": ["s72", 1, 64], "cos_10": ["s72", 1, 64], "unsqueeze_14": ["s72", 1, 64], "sin_10": ["s72", 1, 64], "x1_6": ["s72", 16, 64], "x2_6": ["s72", 16, 64], "mul_44": ["s72", 16, 64], "mul_45": ["s72", 16, 64], "o1_6": ["s72", 16, 64], "mul_46": ["s72", 16, 64], "mul_47": ["s72", 16, 64], "o2_6": ["s72", 16, 64], "output_19": ["s72", 16, 128], "cat_13": ["s72", 16, 128], "query_10": ["s72", 2048], "key_9": ["s72", 4, 128], "key_rot_3": ["s72", 4, 128], "key_pass_3": ["s72", 4, 0], "unsqueeze_15": ["s72", 1, 64], "cos_11": ["s72", 1, 64], "unsqueeze_16": ["s72", 1, 64], "sin_11": ["s72", 1, 64], "x1_7": ["s72", 4, 64], "x2_7": ["s72", 4, 64], "mul_48": ["s72", 4, 64], "mul_49": ["s72", 4, 64], "o1_7": ["s72", 4, 64], "mul_50": ["s72", 4, 64], "mul_51": ["s72", 4, 64], "o2_7": ["s72", 4, 64], "output_20": ["s72", 4, 128], "cat_15": ["s72", 4, 128], "key_10": ["s72", 512], "output_21": ["s72", 2048], "query_11": ["s72", 16, 128], "output_22": ["s72", 16, 128], "key_11": ["s72", 4, 128], "value_3": ["s72", 4, 128], "attn_output_3": ["s72", 2048], "output_parallel_14": ["s72", 4096], "output_23": ["s72", 4096], "_get_data_attr_7": [4096], "x_37": ["s72", 4096], "x_38": ["s72", 4096], "residual_6": ["s72", 4096], "pow_8": ["s72", 4096], "variance_7": ["s72", 1], "add_23": ["s72", 1], "rsqrt_7": ["s72", 1], "x_39": ["s72", 4096], "x_40": ["s72", 4096], "x_41": ["s72", 4096], "output_parallel_15": ["s72", 14336], "getitem_110": ["s72", 7168], "silu_3": ["s72", 7168], "getitem_111": ["s72", 7168], "x_42": ["s72", 7168], "output_parallel_16": ["s72", 4096], "output_24": ["s72", 4096], "_get_data_attr_8": [4096], "x_43": ["s72", 4096], "x_44": ["s72", 4096], "residual_7": ["s72", 4096], "pow_9": ["s72", 4096], "variance_8": ["s72", 1], "add_25": ["s72", 1], "rsqrt_8": ["s72", 1], "x_45": ["s72", 4096], "x_46": ["s72", 4096], "x_47": ["s72", 4096], "output_parallel_17": ["s72", 3072], "q_4": ["s72", 2048], "k_4": ["s72", 512], "v_4": ["s72", 512], "positions_4": ["s72"], "cos_sin_4": ["s72", 128], "cos_12": ["s72", 64], "sin_12": ["s72", 64], "query_12": ["s72", 16, 128], "query_rot_4": ["s72", 16, 128], "query_pass_4": ["s72", 16, 0], "unsqueeze_17": ["s72", 1, 64], "cos_13": ["s72", 1, 64], "unsqueeze_18": ["s72", 1, 64], "sin_13": ["s72", 1, 64], "x1_8": ["s72", 16, 64], "x2_8": ["s72", 16, 64], "mul_57": ["s72", 16, 64], "mul_58": ["s72", 16, 64], "o1_8": ["s72", 16, 64], "mul_59": ["s72", 16, 64], "mul_60": ["s72", 16, 64], "o2_8": ["s72", 16, 64], "output_25": ["s72", 16, 128], "cat_17": ["s72", 16, 128], "query_13": ["s72", 2048], "key_12": ["s72", 4, 128], "key_rot_4": ["s72", 4, 128], "key_pass_4": ["s72", 4, 0], "unsqueeze_19": ["s72", 1, 64], "cos_14": ["s72", 1, 64], "unsqueeze_20": ["s72", 1, 64], "sin_14": ["s72", 1, 64], "x1_9": ["s72", 4, 64], "x2_9": ["s72", 4, 
64], "mul_61": ["s72", 4, 64], "mul_62": ["s72", 4, 64], "o1_9": ["s72", 4, 64], "mul_63": ["s72", 4, 64], "mul_64": ["s72", 4, 64], "o2_9": ["s72", 4, 64], "output_26": ["s72", 4, 128], "cat_19": ["s72", 4, 128], "key_13": ["s72", 512], "output_27": ["s72", 2048], "query_14": ["s72", 16, 128], "output_28": ["s72", 16, 128], "key_14": ["s72", 4, 128], "value_4": ["s72", 4, 128], "attn_output_4": ["s72", 2048], "output_parallel_18": ["s72", 4096], "output_29": ["s72", 4096], "_get_data_attr_9": [4096], "x_48": ["s72", 4096], "x_49": ["s72", 4096], "residual_8": ["s72", 4096], "pow_10": ["s72", 4096], "variance_9": ["s72", 1], "add_29": ["s72", 1], "rsqrt_9": ["s72", 1], "x_50": ["s72", 4096], "x_51": ["s72", 4096], "x_52": ["s72", 4096], "output_parallel_19": ["s72", 14336], "getitem_138": ["s72", 7168], "silu_4": ["s72", 7168], "getitem_139": ["s72", 7168], "x_53": ["s72", 7168], "output_parallel_20": ["s72", 4096], "output_30": ["s72", 4096], "_get_data_attr_10": [4096], "x_54": ["s72", 4096], "x_55": ["s72", 4096], "residual_9": ["s72", 4096], "pow_11": ["s72", 4096], "variance_10": ["s72", 1], "add_31": ["s72", 1], "rsqrt_10": ["s72", 1], "x_56": ["s72", 4096], "x_57": ["s72", 4096], "x_58": ["s72", 4096], "output_parallel_21": ["s72", 3072], "q_5": ["s72", 2048], "k_5": ["s72", 512], "v_5": ["s72", 512], "positions_5": ["s72"], "cos_sin_5": ["s72", 128], "cos_15": ["s72", 64], "sin_15": ["s72", 64], "query_15": ["s72", 16, 128], "query_rot_5": ["s72", 16, 128], "query_pass_5": ["s72", 16, 0], "unsqueeze_21": ["s72", 1, 64], "cos_16": ["s72", 1, 64], "unsqueeze_22": ["s72", 1, 64], "sin_16": ["s72", 1, 64], "x1_10": ["s72", 16, 64], "x2_10": ["s72", 16, 64], "mul_70": ["s72", 16, 64], "mul_71": ["s72", 16, 64], "o1_10": ["s72", 16, 64], "mul_72": ["s72", 16, 64], "mul_73": ["s72", 16, 64], "o2_10": ["s72", 16, 64], "output_31": ["s72", 16, 128], "cat_21": ["s72", 16, 128], "query_16": ["s72", 2048], "key_15": ["s72", 4, 128], "key_rot_5": ["s72", 4, 128], "key_pass_5": ["s72", 4, 0], "unsqueeze_23": ["s72", 1, 64], "cos_17": ["s72", 1, 64], "unsqueeze_24": ["s72", 1, 64], "sin_17": ["s72", 1, 64], "x1_11": ["s72", 4, 64], "x2_11": ["s72", 4, 64], "mul_74": ["s72", 4, 64], "mul_75": ["s72", 4, 64], "o1_11": ["s72", 4, 64], "mul_76": ["s72", 4, 64], "mul_77": ["s72", 4, 64], "o2_11": ["s72", 4, 64], "output_32": ["s72", 4, 128], "cat_23": ["s72", 4, 128], "key_16": ["s72", 512], "output_33": ["s72", 2048], "query_17": ["s72", 16, 128], "output_34": ["s72", 16, 128], "key_17": ["s72", 4, 128], "value_5": ["s72", 4, 128], "attn_output_5": ["s72", 2048], "output_parallel_22": ["s72", 4096], "output_35": ["s72", 4096], "_get_data_attr_11": [4096], "x_59": ["s72", 4096], "x_60": ["s72", 4096], "residual_10": ["s72", 4096], "pow_12": ["s72", 4096], "variance_11": ["s72", 1], "add_35": ["s72", 1], "rsqrt_11": ["s72", 1], "x_61": ["s72", 4096], "x_62": ["s72", 4096], "x_63": ["s72", 4096], "output_parallel_23": ["s72", 14336], "getitem_166": ["s72", 7168], "silu_5": ["s72", 7168], "getitem_167": ["s72", 7168], "x_64": ["s72", 7168], "output_parallel_24": ["s72", 4096], "output_36": ["s72", 4096], "_get_data_attr_12": [4096], "x_65": ["s72", 4096], "x_66": ["s72", 4096], "residual_11": ["s72", 4096], "pow_13": ["s72", 4096], "variance_12": ["s72", 1], "add_37": ["s72", 1], "rsqrt_12": ["s72", 1], "x_67": ["s72", 4096], "x_68": ["s72", 4096], "x_69": ["s72", 4096], "output_parallel_25": ["s72", 3072], "q_6": ["s72", 2048], "k_6": ["s72", 512], "v_6": ["s72", 512], "positions_6": ["s72"], 
"cos_sin_6": ["s72", 128], "cos_18": ["s72", 64], "sin_18": ["s72", 64], "query_18": ["s72", 16, 128], "query_rot_6": ["s72", 16, 128], "query_pass_6": ["s72", 16, 0], "unsqueeze_25": ["s72", 1, 64], "cos_19": ["s72", 1, 64], "unsqueeze_26": ["s72", 1, 64], "sin_19": ["s72", 1, 64], "x1_12": ["s72", 16, 64], "x2_12": ["s72", 16, 64], "mul_83": ["s72", 16, 64], "mul_84": ["s72", 16, 64], "o1_12": ["s72", 16, 64], "mul_85": ["s72", 16, 64], "mul_86": ["s72", 16, 64], "o2_12": ["s72", 16, 64], "output_37": ["s72", 16, 128], "cat_25": ["s72", 16, 128], "query_19": ["s72", 2048], "key_18": ["s72", 4, 128], "key_rot_6": ["s72", 4, 128], "key_pass_6": ["s72", 4, 0], "unsqueeze_27": ["s72", 1, 64], "cos_20": ["s72", 1, 64], "unsqueeze_28": ["s72", 1, 64], "sin_20": ["s72", 1, 64], "x1_13": ["s72", 4, 64], "x2_13": ["s72", 4, 64], "mul_87": ["s72", 4, 64], "mul_88": ["s72", 4, 64], "o1_13": ["s72", 4, 64], "mul_89": ["s72", 4, 64], "mul_90": ["s72", 4, 64], "o2_13": ["s72", 4, 64], "output_38": ["s72", 4, 128], "cat_27": ["s72", 4, 128], "key_19": ["s72", 512], "output_39": ["s72", 2048], "query_20": ["s72", 16, 128], "output_40": ["s72", 16, 128], "key_20": ["s72", 4, 128], "value_6": ["s72", 4, 128], "attn_output_6": ["s72", 2048], "output_parallel_26": ["s72", 4096], "output_41": ["s72", 4096], "_get_data_attr_13": [4096], "x_70": ["s72", 4096], "x_71": ["s72", 4096], "residual_12": ["s72", 4096], "pow_14": ["s72", 4096], "variance_13": ["s72", 1], "add_41": ["s72", 1], "rsqrt_13": ["s72", 1], "x_72": ["s72", 4096], "x_73": ["s72", 4096], "x_74": ["s72", 4096], "output_parallel_27": ["s72", 14336], "getitem_194": ["s72", 7168], "silu_6": ["s72", 7168], "getitem_195": ["s72", 7168], "x_75": ["s72", 7168], "output_parallel_28": ["s72", 4096], "output_42": ["s72", 4096], "_get_data_attr_14": [4096], "x_76": ["s72", 4096], "x_77": ["s72", 4096], "residual_13": ["s72", 4096], "pow_15": ["s72", 4096], "variance_14": ["s72", 1], "add_43": ["s72", 1], "rsqrt_14": ["s72", 1], "x_78": ["s72", 4096], "x_79": ["s72", 4096], "x_80": ["s72", 4096], "output_parallel_29": ["s72", 3072], "q_7": ["s72", 2048], "k_7": ["s72", 512], "v_7": ["s72", 512], "positions_7": ["s72"], "cos_sin_7": ["s72", 128], "cos_21": ["s72", 64], "sin_21": ["s72", 64], "query_21": ["s72", 16, 128], "query_rot_7": ["s72", 16, 128], "query_pass_7": ["s72", 16, 0], "unsqueeze_29": ["s72", 1, 64], "cos_22": ["s72", 1, 64], "unsqueeze_30": ["s72", 1, 64], "sin_22": ["s72", 1, 64], "x1_14": ["s72", 16, 64], "x2_14": ["s72", 16, 64], "mul_96": ["s72", 16, 64], "mul_97": ["s72", 16, 64], "o1_14": ["s72", 16, 64], "mul_98": ["s72", 16, 64], "mul_99": ["s72", 16, 64], "o2_14": ["s72", 16, 64], "output_43": ["s72", 16, 128], "cat_29": ["s72", 16, 128], "query_22": ["s72", 2048], "key_21": ["s72", 4, 128], "key_rot_7": ["s72", 4, 128], "key_pass_7": ["s72", 4, 0], "unsqueeze_31": ["s72", 1, 64], "cos_23": ["s72", 1, 64], "unsqueeze_32": ["s72", 1, 64], "sin_23": ["s72", 1, 64], "x1_15": ["s72", 4, 64], "x2_15": ["s72", 4, 64], "mul_100": ["s72", 4, 64], "mul_101": ["s72", 4, 64], "o1_15": ["s72", 4, 64], "mul_102": ["s72", 4, 64], "mul_103": ["s72", 4, 64], "o2_15": ["s72", 4, 64], "output_44": ["s72", 4, 128], "cat_31": ["s72", 4, 128], "key_22": ["s72", 512], "output_45": ["s72", 2048], "query_23": ["s72", 16, 128], "output_46": ["s72", 16, 128], "key_23": ["s72", 4, 128], "value_7": ["s72", 4, 128], "attn_output_7": ["s72", 2048], "output_parallel_30": ["s72", 4096], "output_47": ["s72", 4096], "_get_data_attr_15": [4096], "x_81": ["s72", 
4096], "x_82": ["s72", 4096], "residual_14": ["s72", 4096], "pow_16": ["s72", 4096], "variance_15": ["s72", 1], "add_47": ["s72", 1], "rsqrt_15": ["s72", 1], "x_83": ["s72", 4096], "x_84": ["s72", 4096], "x_85": ["s72", 4096], "output_parallel_31": ["s72", 14336], "getitem_222": ["s72", 7168], "silu_7": ["s72", 7168], "getitem_223": ["s72", 7168], "x_86": ["s72", 7168], "output_parallel_32": ["s72", 4096], "output_48": ["s72", 4096], "_get_data_attr_16": [4096], "x_87": ["s72", 4096], "x_88": ["s72", 4096], "residual_15": ["s72", 4096], "pow_17": ["s72", 4096], "variance_16": ["s72", 1], "add_49": ["s72", 1], "rsqrt_16": ["s72", 1], "x_89": ["s72", 4096], "x_90": ["s72", 4096], "x_91": ["s72", 4096], "output_parallel_33": ["s72", 3072], "q_8": ["s72", 2048], "k_8": ["s72", 512], "v_8": ["s72", 512], "positions_8": ["s72"], "cos_sin_8": ["s72", 128], "cos_24": ["s72", 64], "sin_24": ["s72", 64], "query_24": ["s72", 16, 128], "query_rot_8": ["s72", 16, 128], "query_pass_8": ["s72", 16, 0], "unsqueeze_33": ["s72", 1, 64], "cos_25": ["s72", 1, 64], "unsqueeze_34": ["s72", 1, 64], "sin_25": ["s72", 1, 64], "x1_16": ["s72", 16, 64], "x2_16": ["s72", 16, 64], "mul_109": ["s72", 16, 64], "mul_110": ["s72", 16, 64], "o1_16": ["s72", 16, 64], "mul_111": ["s72", 16, 64], "mul_112": ["s72", 16, 64], "o2_16": ["s72", 16, 64], "output_49": ["s72", 16, 128], "cat_33": ["s72", 16, 128], "query_25": ["s72", 2048], "key_24": ["s72", 4, 128], "key_rot_8": ["s72", 4, 128], "key_pass_8": ["s72", 4, 0], "unsqueeze_35": ["s72", 1, 64], "cos_26": ["s72", 1, 64], "unsqueeze_36": ["s72", 1, 64], "sin_26": ["s72", 1, 64], "x1_17": ["s72", 4, 64], "x2_17": ["s72", 4, 64], "mul_113": ["s72", 4, 64], "mul_114": ["s72", 4, 64], "o1_17": ["s72", 4, 64], "mul_115": ["s72", 4, 64], "mul_116": ["s72", 4, 64], "o2_17": ["s72", 4, 64], "output_50": ["s72", 4, 128], "cat_35": ["s72", 4, 128], "key_25": ["s72", 512], "output_51": ["s72", 2048], "query_26": ["s72", 16, 128], "output_52": ["s72", 16, 128], "key_26": ["s72", 4, 128], "value_8": ["s72", 4, 128], "attn_output_8": ["s72", 2048], "output_parallel_34": ["s72", 4096], "output_53": ["s72", 4096], "_get_data_attr_17": [4096], "x_92": ["s72", 4096], "x_93": ["s72", 4096], "residual_16": ["s72", 4096], "pow_18": ["s72", 4096], "variance_17": ["s72", 1], "add_53": ["s72", 1], "rsqrt_17": ["s72", 1], "x_94": ["s72", 4096], "x_95": ["s72", 4096], "x_96": ["s72", 4096], "output_parallel_35": ["s72", 14336], "getitem_250": ["s72", 7168], "silu_8": ["s72", 7168], "getitem_251": ["s72", 7168], "x_97": ["s72", 7168], "output_parallel_36": ["s72", 4096], "output_54": ["s72", 4096], "_get_data_attr_18": [4096], "x_98": ["s72", 4096], "x_99": ["s72", 4096], "residual_17": ["s72", 4096], "pow_19": ["s72", 4096], "variance_18": ["s72", 1], "add_55": ["s72", 1], "rsqrt_18": ["s72", 1], "x_100": ["s72", 4096], "x_101": ["s72", 4096], "x_102": ["s72", 4096], "output_parallel_37": ["s72", 3072], "q_9": ["s72", 2048], "k_9": ["s72", 512], "v_9": ["s72", 512], "positions_9": ["s72"], "cos_sin_9": ["s72", 128], "cos_27": ["s72", 64], "sin_27": ["s72", 64], "query_27": ["s72", 16, 128], "query_rot_9": ["s72", 16, 128], "query_pass_9": ["s72", 16, 0], "unsqueeze_37": ["s72", 1, 64], "cos_28": ["s72", 1, 64], "unsqueeze_38": ["s72", 1, 64], "sin_28": ["s72", 1, 64], "x1_18": ["s72", 16, 64], "x2_18": ["s72", 16, 64], "mul_122": ["s72", 16, 64], "mul_123": ["s72", 16, 64], "o1_18": ["s72", 16, 64], "mul_124": ["s72", 16, 64], "mul_125": ["s72", 16, 64], "o2_18": ["s72", 16, 64], "output_55": 
["s72", 16, 128], "cat_37": ["s72", 16, 128], "query_28": ["s72", 2048], "key_27": ["s72", 4, 128], "key_rot_9": ["s72", 4, 128], "key_pass_9": ["s72", 4, 0], "unsqueeze_39": ["s72", 1, 64], "cos_29": ["s72", 1, 64], "unsqueeze_40": ["s72", 1, 64], "sin_29": ["s72", 1, 64], "x1_19": ["s72", 4, 64], "x2_19": ["s72", 4, 64], "mul_126": ["s72", 4, 64], "mul_127": ["s72", 4, 64], "o1_19": ["s72", 4, 64], "mul_128": ["s72", 4, 64], "mul_129": ["s72", 4, 64], "o2_19": ["s72", 4, 64], "output_56": ["s72", 4, 128], "cat_39": ["s72", 4, 128], "key_28": ["s72", 512], "output_57": ["s72", 2048], "query_29": ["s72", 16, 128], "output_58": ["s72", 16, 128], "key_29": ["s72", 4, 128], "value_9": ["s72", 4, 128], "attn_output_9": ["s72", 2048], "output_parallel_38": ["s72", 4096], "output_59": ["s72", 4096], "_get_data_attr_19": [4096], "x_103": ["s72", 4096], "x_104": ["s72", 4096], "residual_18": ["s72", 4096], "pow_20": ["s72", 4096], "variance_19": ["s72", 1], "add_59": ["s72", 1], "rsqrt_19": ["s72", 1], "x_105": ["s72", 4096], "x_106": ["s72", 4096], "x_107": ["s72", 4096], "output_parallel_39": ["s72", 14336], "getitem_278": ["s72", 7168], "silu_9": ["s72", 7168], "getitem_279": ["s72", 7168], "x_108": ["s72", 7168], "output_parallel_40": ["s72", 4096], "output_60": ["s72", 4096], "_get_data_attr_20": [4096], "x_109": ["s72", 4096], "x_110": ["s72", 4096], "residual_19": ["s72", 4096], "pow_21": ["s72", 4096], "variance_20": ["s72", 1], "add_61": ["s72", 1], "rsqrt_20": ["s72", 1], "x_111": ["s72", 4096], "x_112": ["s72", 4096], "x_113": ["s72", 4096], "output_parallel_41": ["s72", 3072], "q_10": ["s72", 2048], "k_10": ["s72", 512], "v_10": ["s72", 512], "positions_10": ["s72"], "cos_sin_10": ["s72", 128], "cos_30": ["s72", 64], "sin_30": ["s72", 64], "query_30": ["s72", 16, 128], "query_rot_10": ["s72", 16, 128], "query_pass_10": ["s72", 16, 0], "unsqueeze_41": ["s72", 1, 64], "cos_31": ["s72", 1, 64], "unsqueeze_42": ["s72", 1, 64], "sin_31": ["s72", 1, 64], "x1_20": ["s72", 16, 64], "x2_20": ["s72", 16, 64], "mul_135": ["s72", 16, 64], "mul_136": ["s72", 16, 64], "o1_20": ["s72", 16, 64], "mul_137": ["s72", 16, 64], "mul_138": ["s72", 16, 64], "o2_20": ["s72", 16, 64], "output_61": ["s72", 16, 128], "cat_41": ["s72", 16, 128], "query_31": ["s72", 2048], "key_30": ["s72", 4, 128], "key_rot_10": ["s72", 4, 128], "key_pass_10": ["s72", 4, 0], "unsqueeze_43": ["s72", 1, 64], "cos_32": ["s72", 1, 64], "unsqueeze_44": ["s72", 1, 64], "sin_32": ["s72", 1, 64], "x1_21": ["s72", 4, 64], "x2_21": ["s72", 4, 64], "mul_139": ["s72", 4, 64], "mul_140": ["s72", 4, 64], "o1_21": ["s72", 4, 64], "mul_141": ["s72", 4, 64], "mul_142": ["s72", 4, 64], "o2_21": ["s72", 4, 64], "output_62": ["s72", 4, 128], "cat_43": ["s72", 4, 128], "key_31": ["s72", 512], "output_63": ["s72", 2048], "query_32": ["s72", 16, 128], "output_64": ["s72", 16, 128], "key_32": ["s72", 4, 128], "value_10": ["s72", 4, 128], "attn_output_10": ["s72", 2048], "output_parallel_42": ["s72", 4096], "output_65": ["s72", 4096], "_get_data_attr_21": [4096], "x_114": ["s72", 4096], "x_115": ["s72", 4096], "residual_20": ["s72", 4096], "pow_22": ["s72", 4096], "variance_21": ["s72", 1], "add_65": ["s72", 1], "rsqrt_21": ["s72", 1], "x_116": ["s72", 4096], "x_117": ["s72", 4096], "x_118": ["s72", 4096], "output_parallel_43": ["s72", 14336], "getitem_306": ["s72", 7168], "silu_10": ["s72", 7168], "getitem_307": ["s72", 7168], "x_119": ["s72", 7168], "output_parallel_44": ["s72", 4096], "output_66": ["s72", 4096], "_get_data_attr_22": [4096], "x_120": 
["s72", 4096], "x_121": ["s72", 4096], "residual_21": ["s72", 4096], "pow_23": ["s72", 4096], "variance_22": ["s72", 1], "add_67": ["s72", 1], "rsqrt_22": ["s72", 1], "x_122": ["s72", 4096], "x_123": ["s72", 4096], "x_124": ["s72", 4096], "output_parallel_45": ["s72", 3072], "q_11": ["s72", 2048], "k_11": ["s72", 512], "v_11": ["s72", 512], "positions_11": ["s72"], "cos_sin_11": ["s72", 128], "cos_33": ["s72", 64], "sin_33": ["s72", 64], "query_33": ["s72", 16, 128], "query_rot_11": ["s72", 16, 128], "query_pass_11": ["s72", 16, 0], "unsqueeze_45": ["s72", 1, 64], "cos_34": ["s72", 1, 64], "unsqueeze_46": ["s72", 1, 64], "sin_34": ["s72", 1, 64], "x1_22": ["s72", 16, 64], "x2_22": ["s72", 16, 64], "mul_148": ["s72", 16, 64], "mul_149": ["s72", 16, 64], "o1_22": ["s72", 16, 64], "mul_150": ["s72", 16, 64], "mul_151": ["s72", 16, 64], "o2_22": ["s72", 16, 64], "output_67": ["s72", 16, 128], "cat_45": ["s72", 16, 128], "query_34": ["s72", 2048], "key_33": ["s72", 4, 128], "key_rot_11": ["s72", 4, 128], "key_pass_11": ["s72", 4, 0], "unsqueeze_47": ["s72", 1, 64], "cos_35": ["s72", 1, 64], "unsqueeze_48": ["s72", 1, 64], "sin_35": ["s72", 1, 64], "x1_23": ["s72", 4, 64], "x2_23": ["s72", 4, 64], "mul_152": ["s72", 4, 64], "mul_153": ["s72", 4, 64], "o1_23": ["s72", 4, 64], "mul_154": ["s72", 4, 64], "mul_155": ["s72", 4, 64], "o2_23": ["s72", 4, 64], "output_68": ["s72", 4, 128], "cat_47": ["s72", 4, 128], "key_34": ["s72", 512], "output_69": ["s72", 2048], "query_35": ["s72", 16, 128], "output_70": ["s72", 16, 128], "key_35": ["s72", 4, 128], "value_11": ["s72", 4, 128], "attn_output_11": ["s72", 2048], "output_parallel_46": ["s72", 4096], "output_71": ["s72", 4096], "_get_data_attr_23": [4096], "x_125": ["s72", 4096], "x_126": ["s72", 4096], "residual_22": ["s72", 4096], "pow_24": ["s72", 4096], "variance_23": ["s72", 1], "add_71": ["s72", 1], "rsqrt_23": ["s72", 1], "x_127": ["s72", 4096], "x_128": ["s72", 4096], "x_129": ["s72", 4096], "output_parallel_47": ["s72", 14336], "getitem_334": ["s72", 7168], "silu_11": ["s72", 7168], "getitem_335": ["s72", 7168], "x_130": ["s72", 7168], "output_parallel_48": ["s72", 4096], "output_72": ["s72", 4096], "_get_data_attr_24": [4096], "x_131": ["s72", 4096], "x_132": ["s72", 4096], "residual_23": ["s72", 4096], "pow_25": ["s72", 4096], "variance_24": ["s72", 1], "add_73": ["s72", 1], "rsqrt_24": ["s72", 1], "x_133": ["s72", 4096], "x_134": ["s72", 4096], "x_135": ["s72", 4096], "output_parallel_49": ["s72", 3072], "q_12": ["s72", 2048], "k_12": ["s72", 512], "v_12": ["s72", 512], "positions_12": ["s72"], "cos_sin_12": ["s72", 128], "cos_36": ["s72", 64], "sin_36": ["s72", 64], "query_36": ["s72", 16, 128], "query_rot_12": ["s72", 16, 128], "query_pass_12": ["s72", 16, 0], "unsqueeze_49": ["s72", 1, 64], "cos_37": ["s72", 1, 64], "unsqueeze_50": ["s72", 1, 64], "sin_37": ["s72", 1, 64], "x1_24": ["s72", 16, 64], "x2_24": ["s72", 16, 64], "mul_161": ["s72", 16, 64], "mul_162": ["s72", 16, 64], "o1_24": ["s72", 16, 64], "mul_163": ["s72", 16, 64], "mul_164": ["s72", 16, 64], "o2_24": ["s72", 16, 64], "output_73": ["s72", 16, 128], "cat_49": ["s72", 16, 128], "query_37": ["s72", 2048], "key_36": ["s72", 4, 128], "key_rot_12": ["s72", 4, 128], "key_pass_12": ["s72", 4, 0], "unsqueeze_51": ["s72", 1, 64], "cos_38": ["s72", 1, 64], "unsqueeze_52": ["s72", 1, 64], "sin_38": ["s72", 1, 64], "x1_25": ["s72", 4, 64], "x2_25": ["s72", 4, 64], "mul_165": ["s72", 4, 64], "mul_166": ["s72", 4, 64], "o1_25": ["s72", 4, 64], "mul_167": ["s72", 4, 64], "mul_168": 
["s72", 4, 64], "o2_25": ["s72", 4, 64], "output_74": ["s72", 4, 128], "cat_51": ["s72", 4, 128], "key_37": ["s72", 512], "output_75": ["s72", 2048], "query_38": ["s72", 16, 128], "output_76": ["s72", 16, 128], "key_38": ["s72", 4, 128], "value_12": ["s72", 4, 128], "attn_output_12": ["s72", 2048], "output_parallel_50": ["s72", 4096], "output_77": ["s72", 4096], "_get_data_attr_25": [4096], "x_136": ["s72", 4096], "x_137": ["s72", 4096], "residual_24": ["s72", 4096], "pow_26": ["s72", 4096], "variance_25": ["s72", 1], "add_77": ["s72", 1], "rsqrt_25": ["s72", 1], "x_138": ["s72", 4096], "x_139": ["s72", 4096], "x_140": ["s72", 4096], "output_parallel_51": ["s72", 14336], "getitem_362": ["s72", 7168], "silu_12": ["s72", 7168], "getitem_363": ["s72", 7168], "x_141": ["s72", 7168], "output_parallel_52": ["s72", 4096], "output_78": ["s72", 4096], "_get_data_attr_26": [4096], "x_142": ["s72", 4096], "x_143": ["s72", 4096], "residual_25": ["s72", 4096], "pow_27": ["s72", 4096], "variance_26": ["s72", 1], "add_79": ["s72", 1], "rsqrt_26": ["s72", 1], "x_144": ["s72", 4096], "x_145": ["s72", 4096], "x_146": ["s72", 4096], "output_parallel_53": ["s72", 3072], "q_13": ["s72", 2048], "k_13": ["s72", 512], "v_13": ["s72", 512], "positions_13": ["s72"], "cos_sin_13": ["s72", 128], "cos_39": ["s72", 64], "sin_39": ["s72", 64], "query_39": ["s72", 16, 128], "query_rot_13": ["s72", 16, 128], "query_pass_13": ["s72", 16, 0], "unsqueeze_53": ["s72", 1, 64], "cos_40": ["s72", 1, 64], "unsqueeze_54": ["s72", 1, 64], "sin_40": ["s72", 1, 64], "x1_26": ["s72", 16, 64], "x2_26": ["s72", 16, 64], "mul_174": ["s72", 16, 64], "mul_175": ["s72", 16, 64], "o1_26": ["s72", 16, 64], "mul_176": ["s72", 16, 64], "mul_177": ["s72", 16, 64], "o2_26": ["s72", 16, 64], "output_79": ["s72", 16, 128], "cat_53": ["s72", 16, 128], "query_40": ["s72", 2048], "key_39": ["s72", 4, 128], "key_rot_13": ["s72", 4, 128], "key_pass_13": ["s72", 4, 0], "unsqueeze_55": ["s72", 1, 64], "cos_41": ["s72", 1, 64], "unsqueeze_56": ["s72", 1, 64], "sin_41": ["s72", 1, 64], "x1_27": ["s72", 4, 64], "x2_27": ["s72", 4, 64], "mul_178": ["s72", 4, 64], "mul_179": ["s72", 4, 64], "o1_27": ["s72", 4, 64], "mul_180": ["s72", 4, 64], "mul_181": ["s72", 4, 64], "o2_27": ["s72", 4, 64], "output_80": ["s72", 4, 128], "cat_55": ["s72", 4, 128], "key_40": ["s72", 512], "output_81": ["s72", 2048], "query_41": ["s72", 16, 128], "output_82": ["s72", 16, 128], "key_41": ["s72", 4, 128], "value_13": ["s72", 4, 128], "attn_output_13": ["s72", 2048], "output_parallel_54": ["s72", 4096], "output_83": ["s72", 4096], "_get_data_attr_27": [4096], "x_147": ["s72", 4096], "x_148": ["s72", 4096], "residual_26": ["s72", 4096], "pow_28": ["s72", 4096], "variance_27": ["s72", 1], "add_83": ["s72", 1], "rsqrt_27": ["s72", 1], "x_149": ["s72", 4096], "x_150": ["s72", 4096], "x_151": ["s72", 4096], "output_parallel_55": ["s72", 14336], "getitem_390": ["s72", 7168], "silu_13": ["s72", 7168], "getitem_391": ["s72", 7168], "x_152": ["s72", 7168], "output_parallel_56": ["s72", 4096], "output_84": ["s72", 4096], "_get_data_attr_28": [4096], "x_153": ["s72", 4096], "x_154": ["s72", 4096], "residual_27": ["s72", 4096], "pow_29": ["s72", 4096], "variance_28": ["s72", 1], "add_85": ["s72", 1], "rsqrt_28": ["s72", 1], "x_155": ["s72", 4096], "x_156": ["s72", 4096], "x_157": ["s72", 4096], "output_parallel_57": ["s72", 3072], "q_14": ["s72", 2048], "k_14": ["s72", 512], "v_14": ["s72", 512], "positions_14": ["s72"], "cos_sin_14": ["s72", 128], "cos_42": ["s72", 64], "sin_42": ["s72", 
64], "query_42": ["s72", 16, 128], "query_rot_14": ["s72", 16, 128], "query_pass_14": ["s72", 16, 0], "unsqueeze_57": ["s72", 1, 64], "cos_43": ["s72", 1, 64], "unsqueeze_58": ["s72", 1, 64], "sin_43": ["s72", 1, 64], "x1_28": ["s72", 16, 64], "x2_28": ["s72", 16, 64], "mul_187": ["s72", 16, 64], "mul_188": ["s72", 16, 64], "o1_28": ["s72", 16, 64], "mul_189": ["s72", 16, 64], "mul_190": ["s72", 16, 64], "o2_28": ["s72", 16, 64], "output_85": ["s72", 16, 128], "cat_57": ["s72", 16, 128], "query_43": ["s72", 2048], "key_42": ["s72", 4, 128], "key_rot_14": ["s72", 4, 128], "key_pass_14": ["s72", 4, 0], "unsqueeze_59": ["s72", 1, 64], "cos_44": ["s72", 1, 64], "unsqueeze_60": ["s72", 1, 64], "sin_44": ["s72", 1, 64], "x1_29": ["s72", 4, 64], "x2_29": ["s72", 4, 64], "mul_191": ["s72", 4, 64], "mul_192": ["s72", 4, 64], "o1_29": ["s72", 4, 64], "mul_193": ["s72", 4, 64], "mul_194": ["s72", 4, 64], "o2_29": ["s72", 4, 64], "output_86": ["s72", 4, 128], "cat_59": ["s72", 4, 128], "key_43": ["s72", 512], "output_87": ["s72", 2048], "query_44": ["s72", 16, 128], "output_88": ["s72", 16, 128], "key_44": ["s72", 4, 128], "value_14": ["s72", 4, 128], "attn_output_14": ["s72", 2048], "output_parallel_58": ["s72", 4096], "output_89": ["s72", 4096], "_get_data_attr_29": [4096], "x_158": ["s72", 4096], "x_159": ["s72", 4096], "residual_28": ["s72", 4096], "pow_30": ["s72", 4096], "variance_29": ["s72", 1], "add_89": ["s72", 1], "rsqrt_29": ["s72", 1], "x_160": ["s72", 4096], "x_161": ["s72", 4096], "x_162": ["s72", 4096], "output_parallel_59": ["s72", 14336], "getitem_418": ["s72", 7168], "silu_14": ["s72", 7168], "getitem_419": ["s72", 7168], "x_163": ["s72", 7168], "output_parallel_60": ["s72", 4096], "output_90": ["s72", 4096], "_get_data_attr_30": [4096], "x_164": ["s72", 4096], "x_165": ["s72", 4096], "residual_29": ["s72", 4096], "pow_31": ["s72", 4096], "variance_30": ["s72", 1], "add_91": ["s72", 1], "rsqrt_30": ["s72", 1], "x_166": ["s72", 4096], "x_167": ["s72", 4096], "x_168": ["s72", 4096], "output_parallel_61": ["s72", 3072], "q_15": ["s72", 2048], "k_15": ["s72", 512], "v_15": ["s72", 512], "positions_15": ["s72"], "cos_sin_15": ["s72", 128], "cos_45": ["s72", 64], "sin_45": ["s72", 64], "query_45": ["s72", 16, 128], "query_rot_15": ["s72", 16, 128], "query_pass_15": ["s72", 16, 0], "unsqueeze_61": ["s72", 1, 64], "cos_46": ["s72", 1, 64], "unsqueeze_62": ["s72", 1, 64], "sin_46": ["s72", 1, 64], "x1_30": ["s72", 16, 64], "x2_30": ["s72", 16, 64], "mul_200": ["s72", 16, 64], "mul_201": ["s72", 16, 64], "o1_30": ["s72", 16, 64], "mul_202": ["s72", 16, 64], "mul_203": ["s72", 16, 64], "o2_30": ["s72", 16, 64], "output_91": ["s72", 16, 128], "cat_61": ["s72", 16, 128], "query_46": ["s72", 2048], "key_45": ["s72", 4, 128], "key_rot_15": ["s72", 4, 128], "key_pass_15": ["s72", 4, 0], "unsqueeze_63": ["s72", 1, 64], "cos_47": ["s72", 1, 64], "unsqueeze_64": ["s72", 1, 64], "sin_47": ["s72", 1, 64], "x1_31": ["s72", 4, 64], "x2_31": ["s72", 4, 64], "mul_204": ["s72", 4, 64], "mul_205": ["s72", 4, 64], "o1_31": ["s72", 4, 64], "mul_206": ["s72", 4, 64], "mul_207": ["s72", 4, 64], "o2_31": ["s72", 4, 64], "output_92": ["s72", 4, 128], "cat_63": ["s72", 4, 128], "key_46": ["s72", 512], "output_93": ["s72", 2048], "query_47": ["s72", 16, 128], "output_94": ["s72", 16, 128], "key_47": ["s72", 4, 128], "value_15": ["s72", 4, 128], "attn_output_15": ["s72", 2048], "output_parallel_62": ["s72", 4096], "output_95": ["s72", 4096], "_get_data_attr_31": [4096], "x_169": ["s72", 4096], "x_170": ["s72", 4096], 
"residual_30": ["s72", 4096], "pow_32": ["s72", 4096], "variance_31": ["s72", 1], "add_95": ["s72", 1], "rsqrt_31": ["s72", 1], "x_171": ["s72", 4096], "x_172": ["s72", 4096], "x_173": ["s72", 4096], "output_parallel_63": ["s72", 14336], "getitem_446": ["s72", 7168], "silu_15": ["s72", 7168], "getitem_447": ["s72", 7168], "x_174": ["s72", 7168], "output_parallel_64": ["s72", 4096], "output_96": ["s72", 4096], "_get_data_attr_32": [4096], "x_175": ["s72", 4096], "x_176": ["s72", 4096], "residual_31": ["s72", 4096], "pow_33": ["s72", 4096], "variance_32": ["s72", 1], "add_97": ["s72", 1], "rsqrt_32": ["s72", 1], "x_177": ["s72", 4096], "x_178": ["s72", 4096], "x_179": ["s72", 4096], "output_parallel_65": ["s72", 3072], "q_16": ["s72", 2048], "k_16": ["s72", 512], "v_16": ["s72", 512], "positions_16": ["s72"], "cos_sin_16": ["s72", 128], "cos_48": ["s72", 64], "sin_48": ["s72", 64], "query_48": ["s72", 16, 128], "query_rot_16": ["s72", 16, 128], "query_pass_16": ["s72", 16, 0], "unsqueeze_65": ["s72", 1, 64], "cos_49": ["s72", 1, 64], "unsqueeze_66": ["s72", 1, 64], "sin_49": ["s72", 1, 64], "x1_32": ["s72", 16, 64], "x2_32": ["s72", 16, 64], "mul_213": ["s72", 16, 64], "mul_214": ["s72", 16, 64], "o1_32": ["s72", 16, 64], "mul_215": ["s72", 16, 64], "mul_216": ["s72", 16, 64], "o2_32": ["s72", 16, 64], "output_97": ["s72", 16, 128], "cat_65": ["s72", 16, 128], "query_49": ["s72", 2048], "key_48": ["s72", 4, 128], "key_rot_16": ["s72", 4, 128], "key_pass_16": ["s72", 4, 0], "unsqueeze_67": ["s72", 1, 64], "cos_50": ["s72", 1, 64], "unsqueeze_68": ["s72", 1, 64], "sin_50": ["s72", 1, 64], "x1_33": ["s72", 4, 64], "x2_33": ["s72", 4, 64], "mul_217": ["s72", 4, 64], "mul_218": ["s72", 4, 64], "o1_33": ["s72", 4, 64], "mul_219": ["s72", 4, 64], "mul_220": ["s72", 4, 64], "o2_33": ["s72", 4, 64], "output_98": ["s72", 4, 128], "cat_67": ["s72", 4, 128], "key_49": ["s72", 512], "output_99": ["s72", 2048], "query_50": ["s72", 16, 128], "output_100": ["s72", 16, 128], "key_50": ["s72", 4, 128], "value_16": ["s72", 4, 128], "attn_output_16": ["s72", 2048], "output_parallel_66": ["s72", 4096], "output_101": ["s72", 4096], "_get_data_attr_33": [4096], "x_180": ["s72", 4096], "x_181": ["s72", 4096], "residual_32": ["s72", 4096], "pow_34": ["s72", 4096], "variance_33": ["s72", 1], "add_101": ["s72", 1], "rsqrt_33": ["s72", 1], "x_182": ["s72", 4096], "x_183": ["s72", 4096], "x_184": ["s72", 4096], "output_parallel_67": ["s72", 14336], "getitem_474": ["s72", 7168], "silu_16": ["s72", 7168], "getitem_475": ["s72", 7168], "x_185": ["s72", 7168], "output_parallel_68": ["s72", 4096], "output_102": ["s72", 4096], "_get_data_attr_34": [4096], "x_186": ["s72", 4096], "x_187": ["s72", 4096], "residual_33": ["s72", 4096], "pow_35": ["s72", 4096], "variance_34": ["s72", 1], "add_103": ["s72", 1], "rsqrt_34": ["s72", 1], "x_188": ["s72", 4096], "x_189": ["s72", 4096], "x_190": ["s72", 4096], "output_parallel_69": ["s72", 3072], "q_17": ["s72", 2048], "k_17": ["s72", 512], "v_17": ["s72", 512], "positions_17": ["s72"], "cos_sin_17": ["s72", 128], "cos_51": ["s72", 64], "sin_51": ["s72", 64], "query_51": ["s72", 16, 128], "query_rot_17": ["s72", 16, 128], "query_pass_17": ["s72", 16, 0], "unsqueeze_69": ["s72", 1, 64], "cos_52": ["s72", 1, 64], "unsqueeze_70": ["s72", 1, 64], "sin_52": ["s72", 1, 64], "x1_34": ["s72", 16, 64], "x2_34": ["s72", 16, 64], "mul_226": ["s72", 16, 64], "mul_227": ["s72", 16, 64], "o1_34": ["s72", 16, 64], "mul_228": ["s72", 16, 64], "mul_229": ["s72", 16, 64], "o2_34": ["s72", 16, 64], 
"output_103": ["s72", 16, 128], "cat_69": ["s72", 16, 128], "query_52": ["s72", 2048], "key_51": ["s72", 4, 128], "key_rot_17": ["s72", 4, 128], "key_pass_17": ["s72", 4, 0], "unsqueeze_71": ["s72", 1, 64], "cos_53": ["s72", 1, 64], "unsqueeze_72": ["s72", 1, 64], "sin_53": ["s72", 1, 64], "x1_35": ["s72", 4, 64], "x2_35": ["s72", 4, 64], "mul_230": ["s72", 4, 64], "mul_231": ["s72", 4, 64], "o1_35": ["s72", 4, 64], "mul_232": ["s72", 4, 64], "mul_233": ["s72", 4, 64], "o2_35": ["s72", 4, 64], "output_104": ["s72", 4, 128], "cat_71": ["s72", 4, 128], "key_52": ["s72", 512], "output_105": ["s72", 2048], "query_53": ["s72", 16, 128], "output_106": ["s72", 16, 128], "key_53": ["s72", 4, 128], "value_17": ["s72", 4, 128], "attn_output_17": ["s72", 2048], "output_parallel_70": ["s72", 4096], "output_107": ["s72", 4096], "_get_data_attr_35": [4096], "x_191": ["s72", 4096], "x_192": ["s72", 4096], "residual_34": ["s72", 4096], "pow_36": ["s72", 4096], "variance_35": ["s72", 1], "add_107": ["s72", 1], "rsqrt_35": ["s72", 1], "x_193": ["s72", 4096], "x_194": ["s72", 4096], "x_195": ["s72", 4096], "output_parallel_71": ["s72", 14336], "getitem_502": ["s72", 7168], "silu_17": ["s72", 7168], "getitem_503": ["s72", 7168], "x_196": ["s72", 7168], "output_parallel_72": ["s72", 4096], "output_108": ["s72", 4096], "_get_data_attr_36": [4096], "x_197": ["s72", 4096], "x_198": ["s72", 4096], "residual_35": ["s72", 4096], "pow_37": ["s72", 4096], "variance_36": ["s72", 1], "add_109": ["s72", 1], "rsqrt_36": ["s72", 1], "x_199": ["s72", 4096], "x_200": ["s72", 4096], "x_201": ["s72", 4096], "output_parallel_73": ["s72", 3072], "q_18": ["s72", 2048], "k_18": ["s72", 512], "v_18": ["s72", 512], "positions_18": ["s72"], "cos_sin_18": ["s72", 128], "cos_54": ["s72", 64], "sin_54": ["s72", 64], "query_54": ["s72", 16, 128], "query_rot_18": ["s72", 16, 128], "query_pass_18": ["s72", 16, 0], "unsqueeze_73": ["s72", 1, 64], "cos_55": ["s72", 1, 64], "unsqueeze_74": ["s72", 1, 64], "sin_55": ["s72", 1, 64], "x1_36": ["s72", 16, 64], "x2_36": ["s72", 16, 64], "mul_239": ["s72", 16, 64], "mul_240": ["s72", 16, 64], "o1_36": ["s72", 16, 64], "mul_241": ["s72", 16, 64], "mul_242": ["s72", 16, 64], "o2_36": ["s72", 16, 64], "output_109": ["s72", 16, 128], "cat_73": ["s72", 16, 128], "query_55": ["s72", 2048], "key_54": ["s72", 4, 128], "key_rot_18": ["s72", 4, 128], "key_pass_18": ["s72", 4, 0], "unsqueeze_75": ["s72", 1, 64], "cos_56": ["s72", 1, 64], "unsqueeze_76": ["s72", 1, 64], "sin_56": ["s72", 1, 64], "x1_37": ["s72", 4, 64], "x2_37": ["s72", 4, 64], "mul_243": ["s72", 4, 64], "mul_244": ["s72", 4, 64], "o1_37": ["s72", 4, 64], "mul_245": ["s72", 4, 64], "mul_246": ["s72", 4, 64], "o2_37": ["s72", 4, 64], "output_110": ["s72", 4, 128], "cat_75": ["s72", 4, 128], "key_55": ["s72", 512], "output_111": ["s72", 2048], "query_56": ["s72", 16, 128], "output_112": ["s72", 16, 128], "key_56": ["s72", 4, 128], "value_18": ["s72", 4, 128], "attn_output_18": ["s72", 2048], "output_parallel_74": ["s72", 4096], "output_113": ["s72", 4096], "_get_data_attr_37": [4096], "x_202": ["s72", 4096], "x_203": ["s72", 4096], "residual_36": ["s72", 4096], "pow_38": ["s72", 4096], "variance_37": ["s72", 1], "add_113": ["s72", 1], "rsqrt_37": ["s72", 1], "x_204": ["s72", 4096], "x_205": ["s72", 4096], "x_206": ["s72", 4096], "output_parallel_75": ["s72", 14336], "getitem_530": ["s72", 7168], "silu_18": ["s72", 7168], "getitem_531": ["s72", 7168], "x_207": ["s72", 7168], "output_parallel_76": ["s72", 4096], "output_114": ["s72", 4096], 
"_get_data_attr_38": [4096], "x_208": ["s72", 4096], "x_209": ["s72", 4096], "residual_37": ["s72", 4096], "pow_39": ["s72", 4096], "variance_38": ["s72", 1], "add_115": ["s72", 1], "rsqrt_38": ["s72", 1], "x_210": ["s72", 4096], "x_211": ["s72", 4096], "x_212": ["s72", 4096], "output_parallel_77": ["s72", 3072], "q_19": ["s72", 2048], "k_19": ["s72", 512], "v_19": ["s72", 512], "positions_19": ["s72"], "cos_sin_19": ["s72", 128], "cos_57": ["s72", 64], "sin_57": ["s72", 64], "query_57": ["s72", 16, 128], "query_rot_19": ["s72", 16, 128], "query_pass_19": ["s72", 16, 0], "unsqueeze_77": ["s72", 1, 64], "cos_58": ["s72", 1, 64], "unsqueeze_78": ["s72", 1, 64], "sin_58": ["s72", 1, 64], "x1_38": ["s72", 16, 64], "x2_38": ["s72", 16, 64], "mul_252": ["s72", 16, 64], "mul_253": ["s72", 16, 64], "o1_38": ["s72", 16, 64], "mul_254": ["s72", 16, 64], "mul_255": ["s72", 16, 64], "o2_38": ["s72", 16, 64], "output_115": ["s72", 16, 128], "cat_77": ["s72", 16, 128], "query_58": ["s72", 2048], "key_57": ["s72", 4, 128], "key_rot_19": ["s72", 4, 128], "key_pass_19": ["s72", 4, 0], "unsqueeze_79": ["s72", 1, 64], "cos_59": ["s72", 1, 64], "unsqueeze_80": ["s72", 1, 64], "sin_59": ["s72", 1, 64], "x1_39": ["s72", 4, 64], "x2_39": ["s72", 4, 64], "mul_256": ["s72", 4, 64], "mul_257": ["s72", 4, 64], "o1_39": ["s72", 4, 64], "mul_258": ["s72", 4, 64], "mul_259": ["s72", 4, 64], "o2_39": ["s72", 4, 64], "output_116": ["s72", 4, 128], "cat_79": ["s72", 4, 128], "key_58": ["s72", 512], "output_117": ["s72", 2048], "query_59": ["s72", 16, 128], "output_118": ["s72", 16, 128], "key_59": ["s72", 4, 128], "value_19": ["s72", 4, 128], "attn_output_19": ["s72", 2048], "output_parallel_78": ["s72", 4096], "output_119": ["s72", 4096], "_get_data_attr_39": [4096], "x_213": ["s72", 4096], "x_214": ["s72", 4096], "residual_38": ["s72", 4096], "pow_40": ["s72", 4096], "variance_39": ["s72", 1], "add_119": ["s72", 1], "rsqrt_39": ["s72", 1], "x_215": ["s72", 4096], "x_216": ["s72", 4096], "x_217": ["s72", 4096], "output_parallel_79": ["s72", 14336], "getitem_558": ["s72", 7168], "silu_19": ["s72", 7168], "getitem_559": ["s72", 7168], "x_218": ["s72", 7168], "output_parallel_80": ["s72", 4096], "output_120": ["s72", 4096], "_get_data_attr_40": [4096], "x_219": ["s72", 4096], "x_220": ["s72", 4096], "residual_39": ["s72", 4096], "pow_41": ["s72", 4096], "variance_40": ["s72", 1], "add_121": ["s72", 1], "rsqrt_40": ["s72", 1], "x_221": ["s72", 4096], "x_222": ["s72", 4096], "x_223": ["s72", 4096], "output_parallel_81": ["s72", 3072], "q_20": ["s72", 2048], "k_20": ["s72", 512], "v_20": ["s72", 512], "positions_20": ["s72"], "cos_sin_20": ["s72", 128], "cos_60": ["s72", 64], "sin_60": ["s72", 64], "query_60": ["s72", 16, 128], "query_rot_20": ["s72", 16, 128], "query_pass_20": ["s72", 16, 0], "unsqueeze_81": ["s72", 1, 64], "cos_61": ["s72", 1, 64], "unsqueeze_82": ["s72", 1, 64], "sin_61": ["s72", 1, 64], "x1_40": ["s72", 16, 64], "x2_40": ["s72", 16, 64], "mul_265": ["s72", 16, 64], "mul_266": ["s72", 16, 64], "o1_40": ["s72", 16, 64], "mul_267": ["s72", 16, 64], "mul_268": ["s72", 16, 64], "o2_40": ["s72", 16, 64], "output_121": ["s72", 16, 128], "cat_81": ["s72", 16, 128], "query_61": ["s72", 2048], "key_60": ["s72", 4, 128], "key_rot_20": ["s72", 4, 128], "key_pass_20": ["s72", 4, 0], "unsqueeze_83": ["s72", 1, 64], "cos_62": ["s72", 1, 64], "unsqueeze_84": ["s72", 1, 64], "sin_62": ["s72", 1, 64], "x1_41": ["s72", 4, 64], "x2_41": ["s72", 4, 64], "mul_269": ["s72", 4, 64], "mul_270": ["s72", 4, 64], "o1_41": ["s72", 4, 
64], "mul_271": ["s72", 4, 64], "mul_272": ["s72", 4, 64], "o2_41": ["s72", 4, 64], "output_122": ["s72", 4, 128], "cat_83": ["s72", 4, 128], "key_61": ["s72", 512], "output_123": ["s72", 2048], "query_62": ["s72", 16, 128], "output_124": ["s72", 16, 128], "key_62": ["s72", 4, 128], "value_20": ["s72", 4, 128], "attn_output_20": ["s72", 2048], "output_parallel_82": ["s72", 4096], "output_125": ["s72", 4096], "_get_data_attr_41": [4096], "x_224": ["s72", 4096], "x_225": ["s72", 4096], "residual_40": ["s72", 4096], "pow_42": ["s72", 4096], "variance_41": ["s72", 1], "add_125": ["s72", 1], "rsqrt_41": ["s72", 1], "x_226": ["s72", 4096], "x_227": ["s72", 4096], "x_228": ["s72", 4096], "output_parallel_83": ["s72", 14336], "getitem_586": ["s72", 7168], "silu_20": ["s72", 7168], "getitem_587": ["s72", 7168], "x_229": ["s72", 7168], "output_parallel_84": ["s72", 4096], "output_126": ["s72", 4096], "_get_data_attr_42": [4096], "x_230": ["s72", 4096], "x_231": ["s72", 4096], "residual_41": ["s72", 4096], "pow_43": ["s72", 4096], "variance_42": ["s72", 1], "add_127": ["s72", 1], "rsqrt_42": ["s72", 1], "x_232": ["s72", 4096], "x_233": ["s72", 4096], "x_234": ["s72", 4096], "output_parallel_85": ["s72", 3072], "q_21": ["s72", 2048], "k_21": ["s72", 512], "v_21": ["s72", 512], "positions_21": ["s72"], "cos_sin_21": ["s72", 128], "cos_63": ["s72", 64], "sin_63": ["s72", 64], "query_63": ["s72", 16, 128], "query_rot_21": ["s72", 16, 128], "query_pass_21": ["s72", 16, 0], "unsqueeze_85": ["s72", 1, 64], "cos_64": ["s72", 1, 64], "unsqueeze_86": ["s72", 1, 64], "sin_64": ["s72", 1, 64], "x1_42": ["s72", 16, 64], "x2_42": ["s72", 16, 64], "mul_278": ["s72", 16, 64], "mul_279": ["s72", 16, 64], "o1_42": ["s72", 16, 64], "mul_280": ["s72", 16, 64], "mul_281": ["s72", 16, 64], "o2_42": ["s72", 16, 64], "output_127": ["s72", 16, 128], "cat_85": ["s72", 16, 128], "query_64": ["s72", 2048], "key_63": ["s72", 4, 128], "key_rot_21": ["s72", 4, 128], "key_pass_21": ["s72", 4, 0], "unsqueeze_87": ["s72", 1, 64], "cos_65": ["s72", 1, 64], "unsqueeze_88": ["s72", 1, 64], "sin_65": ["s72", 1, 64], "x1_43": ["s72", 4, 64], "x2_43": ["s72", 4, 64], "mul_282": ["s72", 4, 64], "mul_283": ["s72", 4, 64], "o1_43": ["s72", 4, 64], "mul_284": ["s72", 4, 64], "mul_285": ["s72", 4, 64], "o2_43": ["s72", 4, 64], "output_128": ["s72", 4, 128], "cat_87": ["s72", 4, 128], "key_64": ["s72", 512], "output_129": ["s72", 2048], "query_65": ["s72", 16, 128], "output_130": ["s72", 16, 128], "key_65": ["s72", 4, 128], "value_21": ["s72", 4, 128], "attn_output_21": ["s72", 2048], "output_parallel_86": ["s72", 4096], "output_131": ["s72", 4096], "_get_data_attr_43": [4096], "x_235": ["s72", 4096], "x_236": ["s72", 4096], "residual_42": ["s72", 4096], "pow_44": ["s72", 4096], "variance_43": ["s72", 1], "add_131": ["s72", 1], "rsqrt_43": ["s72", 1], "x_237": ["s72", 4096], "x_238": ["s72", 4096], "x_239": ["s72", 4096], "output_parallel_87": ["s72", 14336], "getitem_614": ["s72", 7168], "silu_21": ["s72", 7168], "getitem_615": ["s72", 7168], "x_240": ["s72", 7168], "output_parallel_88": ["s72", 4096], "output_132": ["s72", 4096], "_get_data_attr_44": [4096], "x_241": ["s72", 4096], "x_242": ["s72", 4096], "residual_43": ["s72", 4096], "pow_45": ["s72", 4096], "variance_44": ["s72", 1], "add_133": ["s72", 1], "rsqrt_44": ["s72", 1], "x_243": ["s72", 4096], "x_244": ["s72", 4096], "x_245": ["s72", 4096], "output_parallel_89": ["s72", 3072], "q_22": ["s72", 2048], "k_22": ["s72", 512], "v_22": ["s72", 512], "positions_22": ["s72"], "cos_sin_22": 
["s72", 128], "cos_66": ["s72", 64], "sin_66": ["s72", 64], "query_66": ["s72", 16, 128], "query_rot_22": ["s72", 16, 128], "query_pass_22": ["s72", 16, 0], "unsqueeze_89": ["s72", 1, 64], "cos_67": ["s72", 1, 64], "unsqueeze_90": ["s72", 1, 64], "sin_67": ["s72", 1, 64], "x1_44": ["s72", 16, 64], "x2_44": ["s72", 16, 64], "mul_291": ["s72", 16, 64], "mul_292": ["s72", 16, 64], "o1_44": ["s72", 16, 64], "mul_293": ["s72", 16, 64], "mul_294": ["s72", 16, 64], "o2_44": ["s72", 16, 64], "output_133": ["s72", 16, 128], "cat_89": ["s72", 16, 128], "query_67": ["s72", 2048], "key_66": ["s72", 4, 128], "key_rot_22": ["s72", 4, 128], "key_pass_22": ["s72", 4, 0], "unsqueeze_91": ["s72", 1, 64], "cos_68": ["s72", 1, 64], "unsqueeze_92": ["s72", 1, 64], "sin_68": ["s72", 1, 64], "x1_45": ["s72", 4, 64], "x2_45": ["s72", 4, 64], "mul_295": ["s72", 4, 64], "mul_296": ["s72", 4, 64], "o1_45": ["s72", 4, 64], "mul_297": ["s72", 4, 64], "mul_298": ["s72", 4, 64], "o2_45": ["s72", 4, 64], "output_134": ["s72", 4, 128], "cat_91": ["s72", 4, 128], "key_67": ["s72", 512], "output_135": ["s72", 2048], "query_68": ["s72", 16, 128], "output_136": ["s72", 16, 128], "key_68": ["s72", 4, 128], "value_22": ["s72", 4, 128], "attn_output_22": ["s72", 2048], "output_parallel_90": ["s72", 4096], "output_137": ["s72", 4096], "_get_data_attr_45": [4096], "x_246": ["s72", 4096], "x_247": ["s72", 4096], "residual_44": ["s72", 4096], "pow_46": ["s72", 4096], "variance_45": ["s72", 1], "add_137": ["s72", 1], "rsqrt_45": ["s72", 1], "x_248": ["s72", 4096], "x_249": ["s72", 4096], "x_250": ["s72", 4096], "output_parallel_91": ["s72", 14336], "getitem_642": ["s72", 7168], "silu_22": ["s72", 7168], "getitem_643": ["s72", 7168], "x_251": ["s72", 7168], "output_parallel_92": ["s72", 4096], "output_138": ["s72", 4096], "_get_data_attr_46": [4096], "x_252": ["s72", 4096], "x_253": ["s72", 4096], "residual_45": ["s72", 4096], "pow_47": ["s72", 4096], "variance_46": ["s72", 1], "add_139": ["s72", 1], "rsqrt_46": ["s72", 1], "x_254": ["s72", 4096], "x_255": ["s72", 4096], "x_256": ["s72", 4096], "output_parallel_93": ["s72", 3072], "q_23": ["s72", 2048], "k_23": ["s72", 512], "v_23": ["s72", 512], "positions_23": ["s72"], "cos_sin_23": ["s72", 128], "cos_69": ["s72", 64], "sin_69": ["s72", 64], "query_69": ["s72", 16, 128], "query_rot_23": ["s72", 16, 128], "query_pass_23": ["s72", 16, 0], "unsqueeze_93": ["s72", 1, 64], "cos_70": ["s72", 1, 64], "unsqueeze_94": ["s72", 1, 64], "sin_70": ["s72", 1, 64], "x1_46": ["s72", 16, 64], "x2_46": ["s72", 16, 64], "mul_304": ["s72", 16, 64], "mul_305": ["s72", 16, 64], "o1_46": ["s72", 16, 64], "mul_306": ["s72", 16, 64], "mul_307": ["s72", 16, 64], "o2_46": ["s72", 16, 64], "output_139": ["s72", 16, 128], "cat_93": ["s72", 16, 128], "query_70": ["s72", 2048], "key_69": ["s72", 4, 128], "key_rot_23": ["s72", 4, 128], "key_pass_23": ["s72", 4, 0], "unsqueeze_95": ["s72", 1, 64], "cos_71": ["s72", 1, 64], "unsqueeze_96": ["s72", 1, 64], "sin_71": ["s72", 1, 64], "x1_47": ["s72", 4, 64], "x2_47": ["s72", 4, 64], "mul_308": ["s72", 4, 64], "mul_309": ["s72", 4, 64], "o1_47": ["s72", 4, 64], "mul_310": ["s72", 4, 64], "mul_311": ["s72", 4, 64], "o2_47": ["s72", 4, 64], "output_140": ["s72", 4, 128], "cat_95": ["s72", 4, 128], "key_70": ["s72", 512], "output_141": ["s72", 2048], "query_71": ["s72", 16, 128], "output_142": ["s72", 16, 128], "key_71": ["s72", 4, 128], "value_23": ["s72", 4, 128], "attn_output_23": ["s72", 2048], "output_parallel_94": ["s72", 4096], "output_143": ["s72", 4096], 
"_get_data_attr_47": [4096], "x_257": ["s72", 4096], "x_258": ["s72", 4096], "residual_46": ["s72", 4096], "pow_48": ["s72", 4096], "variance_47": ["s72", 1], "add_143": ["s72", 1], "rsqrt_47": ["s72", 1], "x_259": ["s72", 4096], "x_260": ["s72", 4096], "x_261": ["s72", 4096], "output_parallel_95": ["s72", 14336], "getitem_670": ["s72", 7168], "silu_23": ["s72", 7168], "getitem_671": ["s72", 7168], "x_262": ["s72", 7168], "output_parallel_96": ["s72", 4096], "output_144": ["s72", 4096], "_get_data_attr_48": [4096], "x_263": ["s72", 4096], "x_264": ["s72", 4096], "residual_47": ["s72", 4096], "pow_49": ["s72", 4096], "variance_48": ["s72", 1], "add_145": ["s72", 1], "rsqrt_48": ["s72", 1], "x_265": ["s72", 4096], "x_266": ["s72", 4096], "x_267": ["s72", 4096], "output_parallel_97": ["s72", 3072], "q_24": ["s72", 2048], "k_24": ["s72", 512], "v_24": ["s72", 512], "positions_24": ["s72"], "cos_sin_24": ["s72", 128], "cos_72": ["s72", 64], "sin_72": ["s72", 64], "query_72": ["s72", 16, 128], "query_rot_24": ["s72", 16, 128], "query_pass_24": ["s72", 16, 0], "unsqueeze_97": ["s72", 1, 64], "cos_73": ["s72", 1, 64], "unsqueeze_98": ["s72", 1, 64], "sin_73": ["s72", 1, 64], "x1_48": ["s72", 16, 64], "x2_48": ["s72", 16, 64], "mul_317": ["s72", 16, 64], "mul_318": ["s72", 16, 64], "o1_48": ["s72", 16, 64], "mul_319": ["s72", 16, 64], "mul_320": ["s72", 16, 64], "o2_48": ["s72", 16, 64], "output_145": ["s72", 16, 128], "cat_97": ["s72", 16, 128], "query_73": ["s72", 2048], "key_72": ["s72", 4, 128], "key_rot_24": ["s72", 4, 128], "key_pass_24": ["s72", 4, 0], "unsqueeze_99": ["s72", 1, 64], "cos_74": ["s72", 1, 64], "unsqueeze_100": ["s72", 1, 64], "sin_74": ["s72", 1, 64], "x1_49": ["s72", 4, 64], "x2_49": ["s72", 4, 64], "mul_321": ["s72", 4, 64], "mul_322": ["s72", 4, 64], "o1_49": ["s72", 4, 64], "mul_323": ["s72", 4, 64], "mul_324": ["s72", 4, 64], "o2_49": ["s72", 4, 64], "output_146": ["s72", 4, 128], "cat_99": ["s72", 4, 128], "key_73": ["s72", 512], "output_147": ["s72", 2048], "query_74": ["s72", 16, 128], "output_148": ["s72", 16, 128], "key_74": ["s72", 4, 128], "value_24": ["s72", 4, 128], "attn_output_24": ["s72", 2048], "output_parallel_98": ["s72", 4096], "output_149": ["s72", 4096], "_get_data_attr_49": [4096], "x_268": ["s72", 4096], "x_269": ["s72", 4096], "residual_48": ["s72", 4096], "pow_50": ["s72", 4096], "variance_49": ["s72", 1], "add_149": ["s72", 1], "rsqrt_49": ["s72", 1], "x_270": ["s72", 4096], "x_271": ["s72", 4096], "x_272": ["s72", 4096], "output_parallel_99": ["s72", 14336], "getitem_698": ["s72", 7168], "silu_24": ["s72", 7168], "getitem_699": ["s72", 7168], "x_273": ["s72", 7168], "output_parallel_100": ["s72", 4096], "output_150": ["s72", 4096], "_get_data_attr_50": [4096], "x_274": ["s72", 4096], "x_275": ["s72", 4096], "residual_49": ["s72", 4096], "pow_51": ["s72", 4096], "variance_50": ["s72", 1], "add_151": ["s72", 1], "rsqrt_50": ["s72", 1], "x_276": ["s72", 4096], "x_277": ["s72", 4096], "x_278": ["s72", 4096], "output_parallel_101": ["s72", 3072], "q_25": ["s72", 2048], "k_25": ["s72", 512], "v_25": ["s72", 512], "positions_25": ["s72"], "cos_sin_25": ["s72", 128], "cos_75": ["s72", 64], "sin_75": ["s72", 64], "query_75": ["s72", 16, 128], "query_rot_25": ["s72", 16, 128], "query_pass_25": ["s72", 16, 0], "unsqueeze_101": ["s72", 1, 64], "cos_76": ["s72", 1, 64], "unsqueeze_102": ["s72", 1, 64], "sin_76": ["s72", 1, 64], "x1_50": ["s72", 16, 64], "x2_50": ["s72", 16, 64], "mul_330": ["s72", 16, 64], "mul_331": ["s72", 16, 64], "o1_50": ["s72", 16, 64], 
"mul_332": ["s72", 16, 64], "mul_333": ["s72", 16, 64], "o2_50": ["s72", 16, 64], "output_151": ["s72", 16, 128], "cat_101": ["s72", 16, 128], "query_76": ["s72", 2048], "key_75": ["s72", 4, 128], "key_rot_25": ["s72", 4, 128], "key_pass_25": ["s72", 4, 0], "unsqueeze_103": ["s72", 1, 64], "cos_77": ["s72", 1, 64], "unsqueeze_104": ["s72", 1, 64], "sin_77": ["s72", 1, 64], "x1_51": ["s72", 4, 64], "x2_51": ["s72", 4, 64], "mul_334": ["s72", 4, 64], "mul_335": ["s72", 4, 64], "o1_51": ["s72", 4, 64], "mul_336": ["s72", 4, 64], "mul_337": ["s72", 4, 64], "o2_51": ["s72", 4, 64], "output_152": ["s72", 4, 128], "cat_103": ["s72", 4, 128], "key_76": ["s72", 512], "output_153": ["s72", 2048], "query_77": ["s72", 16, 128], "output_154": ["s72", 16, 128], "key_77": ["s72", 4, 128], "value_25": ["s72", 4, 128], "attn_output_25": ["s72", 2048], "output_parallel_102": ["s72", 4096], "output_155": ["s72", 4096], "_get_data_attr_51": [4096], "x_279": ["s72", 4096], "x_280": ["s72", 4096], "residual_50": ["s72", 4096], "pow_52": ["s72", 4096], "variance_51": ["s72", 1], "add_155": ["s72", 1], "rsqrt_51": ["s72", 1], "x_281": ["s72", 4096], "x_282": ["s72", 4096], "x_283": ["s72", 4096], "output_parallel_103": ["s72", 14336], "getitem_726": ["s72", 7168], "silu_25": ["s72", 7168], "getitem_727": ["s72", 7168], "x_284": ["s72", 7168], "output_parallel_104": ["s72", 4096], "output_156": ["s72", 4096], "_get_data_attr_52": [4096], "x_285": ["s72", 4096], "x_286": ["s72", 4096], "residual_51": ["s72", 4096], "pow_53": ["s72", 4096], "variance_52": ["s72", 1], "add_157": ["s72", 1], "rsqrt_52": ["s72", 1], "x_287": ["s72", 4096], "x_288": ["s72", 4096], "x_289": ["s72", 4096], "output_parallel_105": ["s72", 3072], "q_26": ["s72", 2048], "k_26": ["s72", 512], "v_26": ["s72", 512], "positions_26": ["s72"], "cos_sin_26": ["s72", 128], "cos_78": ["s72", 64], "sin_78": ["s72", 64], "query_78": ["s72", 16, 128], "query_rot_26": ["s72", 16, 128], "query_pass_26": ["s72", 16, 0], "unsqueeze_105": ["s72", 1, 64], "cos_79": ["s72", 1, 64], "unsqueeze_106": ["s72", 1, 64], "sin_79": ["s72", 1, 64], "x1_52": ["s72", 16, 64], "x2_52": ["s72", 16, 64], "mul_343": ["s72", 16, 64], "mul_344": ["s72", 16, 64], "o1_52": ["s72", 16, 64], "mul_345": ["s72", 16, 64], "mul_346": ["s72", 16, 64], "o2_52": ["s72", 16, 64], "output_157": ["s72", 16, 128], "cat_105": ["s72", 16, 128], "query_79": ["s72", 2048], "key_78": ["s72", 4, 128], "key_rot_26": ["s72", 4, 128], "key_pass_26": ["s72", 4, 0], "unsqueeze_107": ["s72", 1, 64], "cos_80": ["s72", 1, 64], "unsqueeze_108": ["s72", 1, 64], "sin_80": ["s72", 1, 64], "x1_53": ["s72", 4, 64], "x2_53": ["s72", 4, 64], "mul_347": ["s72", 4, 64], "mul_348": ["s72", 4, 64], "o1_53": ["s72", 4, 64], "mul_349": ["s72", 4, 64], "mul_350": ["s72", 4, 64], "o2_53": ["s72", 4, 64], "output_158": ["s72", 4, 128], "cat_107": ["s72", 4, 128], "key_79": ["s72", 512], "output_159": ["s72", 2048], "query_80": ["s72", 16, 128], "output_160": ["s72", 16, 128], "key_80": ["s72", 4, 128], "value_26": ["s72", 4, 128], "attn_output_26": ["s72", 2048], "output_parallel_106": ["s72", 4096], "output_161": ["s72", 4096], "_get_data_attr_53": [4096], "x_290": ["s72", 4096], "x_291": ["s72", 4096], "residual_52": ["s72", 4096], "pow_54": ["s72", 4096], "variance_53": ["s72", 1], "add_161": ["s72", 1], "rsqrt_53": ["s72", 1], "x_292": ["s72", 4096], "x_293": ["s72", 4096], "x_294": ["s72", 4096], "output_parallel_107": ["s72", 14336], "getitem_754": ["s72", 7168], "silu_26": ["s72", 7168], "getitem_755": ["s72", 
7168], "x_295": ["s72", 7168], "output_parallel_108": ["s72", 4096], "output_162": ["s72", 4096], "_get_data_attr_54": [4096], "x_296": ["s72", 4096], "x_297": ["s72", 4096], "residual_53": ["s72", 4096], "pow_55": ["s72", 4096], "variance_54": ["s72", 1], "add_163": ["s72", 1], "rsqrt_54": ["s72", 1], "x_298": ["s72", 4096], "x_299": ["s72", 4096], "x_300": ["s72", 4096], "output_parallel_109": ["s72", 3072], "q_27": ["s72", 2048], "k_27": ["s72", 512], "v_27": ["s72", 512], "positions_27": ["s72"], "cos_sin_27": ["s72", 128], "cos_81": ["s72", 64], "sin_81": ["s72", 64], "query_81": ["s72", 16, 128], "query_rot_27": ["s72", 16, 128], "query_pass_27": ["s72", 16, 0], "unsqueeze_109": ["s72", 1, 64], "cos_82": ["s72", 1, 64], "unsqueeze_110": ["s72", 1, 64], "sin_82": ["s72", 1, 64], "x1_54": ["s72", 16, 64], "x2_54": ["s72", 16, 64], "mul_356": ["s72", 16, 64], "mul_357": ["s72", 16, 64], "o1_54": ["s72", 16, 64], "mul_358": ["s72", 16, 64], "mul_359": ["s72", 16, 64], "o2_54": ["s72", 16, 64], "output_163": ["s72", 16, 128], "cat_109": ["s72", 16, 128], "query_82": ["s72", 2048], "key_81": ["s72", 4, 128], "key_rot_27": ["s72", 4, 128], "key_pass_27": ["s72", 4, 0], "unsqueeze_111": ["s72", 1, 64], "cos_83": ["s72", 1, 64], "unsqueeze_112": ["s72", 1, 64], "sin_83": ["s72", 1, 64], "x1_55": ["s72", 4, 64], "x2_55": ["s72", 4, 64], "mul_360": ["s72", 4, 64], "mul_361": ["s72", 4, 64], "o1_55": ["s72", 4, 64], "mul_362": ["s72", 4, 64], "mul_363": ["s72", 4, 64], "o2_55": ["s72", 4, 64], "output_164": ["s72", 4, 128], "cat_111": ["s72", 4, 128], "key_82": ["s72", 512], "output_165": ["s72", 2048], "query_83": ["s72", 16, 128], "output_166": ["s72", 16, 128], "key_83": ["s72", 4, 128], "value_27": ["s72", 4, 128], "attn_output_27": ["s72", 2048], "output_parallel_110": ["s72", 4096], "output_167": ["s72", 4096], "_get_data_attr_55": [4096], "x_301": ["s72", 4096], "x_302": ["s72", 4096], "residual_54": ["s72", 4096], "pow_56": ["s72", 4096], "variance_55": ["s72", 1], "add_167": ["s72", 1], "rsqrt_55": ["s72", 1], "x_303": ["s72", 4096], "x_304": ["s72", 4096], "x_305": ["s72", 4096], "output_parallel_111": ["s72", 14336], "getitem_782": ["s72", 7168], "silu_27": ["s72", 7168], "getitem_783": ["s72", 7168], "x_306": ["s72", 7168], "output_parallel_112": ["s72", 4096], "output_168": ["s72", 4096], "_get_data_attr_56": [4096], "x_307": ["s72", 4096], "x_308": ["s72", 4096], "residual_55": ["s72", 4096], "pow_57": ["s72", 4096], "variance_56": ["s72", 1], "add_169": ["s72", 1], "rsqrt_56": ["s72", 1], "x_309": ["s72", 4096], "x_310": ["s72", 4096], "x_311": ["s72", 4096], "output_parallel_113": ["s72", 3072], "q_28": ["s72", 2048], "k_28": ["s72", 512], "v_28": ["s72", 512], "positions_28": ["s72"], "cos_sin_28": ["s72", 128], "cos_84": ["s72", 64], "sin_84": ["s72", 64], "query_84": ["s72", 16, 128], "query_rot_28": ["s72", 16, 128], "query_pass_28": ["s72", 16, 0], "unsqueeze_113": ["s72", 1, 64], "cos_85": ["s72", 1, 64], "unsqueeze_114": ["s72", 1, 64], "sin_85": ["s72", 1, 64], "x1_56": ["s72", 16, 64], "x2_56": ["s72", 16, 64], "mul_369": ["s72", 16, 64], "mul_370": ["s72", 16, 64], "o1_56": ["s72", 16, 64], "mul_371": ["s72", 16, 64], "mul_372": ["s72", 16, 64], "o2_56": ["s72", 16, 64], "output_169": ["s72", 16, 128], "cat_113": ["s72", 16, 128], "query_85": ["s72", 2048], "key_84": ["s72", 4, 128], "key_rot_28": ["s72", 4, 128], "key_pass_28": ["s72", 4, 0], "unsqueeze_115": ["s72", 1, 64], "cos_86": ["s72", 1, 64], "unsqueeze_116": ["s72", 1, 64], "sin_86": ["s72", 1, 64], "x1_57": 
["s72", 4, 64], "x2_57": ["s72", 4, 64], "mul_373": ["s72", 4, 64], "mul_374": ["s72", 4, 64], "o1_57": ["s72", 4, 64], "mul_375": ["s72", 4, 64], "mul_376": ["s72", 4, 64], "o2_57": ["s72", 4, 64], "output_170": ["s72", 4, 128], "cat_115": ["s72", 4, 128], "key_85": ["s72", 512], "output_171": ["s72", 2048], "query_86": ["s72", 16, 128], "output_172": ["s72", 16, 128], "key_86": ["s72", 4, 128], "value_28": ["s72", 4, 128], "attn_output_28": ["s72", 2048], "output_parallel_114": ["s72", 4096], "output_173": ["s72", 4096], "_get_data_attr_57": [4096], "x_312": ["s72", 4096], "x_313": ["s72", 4096], "residual_56": ["s72", 4096], "pow_58": ["s72", 4096], "variance_57": ["s72", 1], "add_173": ["s72", 1], "rsqrt_57": ["s72", 1], "x_314": ["s72", 4096], "x_315": ["s72", 4096], "x_316": ["s72", 4096], "output_parallel_115": ["s72", 14336], "getitem_810": ["s72", 7168], "silu_28": ["s72", 7168], "getitem_811": ["s72", 7168], "x_317": ["s72", 7168], "output_parallel_116": ["s72", 4096], "output_174": ["s72", 4096], "_get_data_attr_58": [4096], "x_318": ["s72", 4096], "x_319": ["s72", 4096], "residual_57": ["s72", 4096], "pow_59": ["s72", 4096], "variance_58": ["s72", 1], "add_175": ["s72", 1], "rsqrt_58": ["s72", 1], "x_320": ["s72", 4096], "x_321": ["s72", 4096], "x_322": ["s72", 4096], "output_parallel_117": ["s72", 3072], "q_29": ["s72", 2048], "k_29": ["s72", 512], "v_29": ["s72", 512], "positions_29": ["s72"], "cos_sin_29": ["s72", 128], "cos_87": ["s72", 64], "sin_87": ["s72", 64], "query_87": ["s72", 16, 128], "query_rot_29": ["s72", 16, 128], "query_pass_29": ["s72", 16, 0], "unsqueeze_117": ["s72", 1, 64], "cos_88": ["s72", 1, 64], "unsqueeze_118": ["s72", 1, 64], "sin_88": ["s72", 1, 64], "x1_58": ["s72", 16, 64], "x2_58": ["s72", 16, 64], "mul_382": ["s72", 16, 64], "mul_383": ["s72", 16, 64], "o1_58": ["s72", 16, 64], "mul_384": ["s72", 16, 64], "mul_385": ["s72", 16, 64], "o2_58": ["s72", 16, 64], "output_175": ["s72", 16, 128], "cat_117": ["s72", 16, 128], "query_88": ["s72", 2048], "key_87": ["s72", 4, 128], "key_rot_29": ["s72", 4, 128], "key_pass_29": ["s72", 4, 0], "unsqueeze_119": ["s72", 1, 64], "cos_89": ["s72", 1, 64], "unsqueeze_120": ["s72", 1, 64], "sin_89": ["s72", 1, 64], "x1_59": ["s72", 4, 64], "x2_59": ["s72", 4, 64], "mul_386": ["s72", 4, 64], "mul_387": ["s72", 4, 64], "o1_59": ["s72", 4, 64], "mul_388": ["s72", 4, 64], "mul_389": ["s72", 4, 64], "o2_59": ["s72", 4, 64], "output_176": ["s72", 4, 128], "cat_119": ["s72", 4, 128], "key_88": ["s72", 512], "output_177": ["s72", 2048], "query_89": ["s72", 16, 128], "output_178": ["s72", 16, 128], "key_89": ["s72", 4, 128], "value_29": ["s72", 4, 128], "attn_output_29": ["s72", 2048], "output_parallel_118": ["s72", 4096], "output_179": ["s72", 4096], "_get_data_attr_59": [4096], "x_323": ["s72", 4096], "x_324": ["s72", 4096], "residual_58": ["s72", 4096], "pow_60": ["s72", 4096], "variance_59": ["s72", 1], "add_179": ["s72", 1], "rsqrt_59": ["s72", 1], "x_325": ["s72", 4096], "x_326": ["s72", 4096], "x_327": ["s72", 4096], "output_parallel_119": ["s72", 14336], "getitem_838": ["s72", 7168], "silu_29": ["s72", 7168], "getitem_839": ["s72", 7168], "x_328": ["s72", 7168], "output_parallel_120": ["s72", 4096], "output_180": ["s72", 4096], "_get_data_attr_60": [4096], "x_329": ["s72", 4096], "x_330": ["s72", 4096], "residual_59": ["s72", 4096], "pow_61": ["s72", 4096], "variance_60": ["s72", 1], "add_181": ["s72", 1], "rsqrt_60": ["s72", 1], "x_331": ["s72", 4096], "x_332": ["s72", 4096], "x_333": ["s72", 4096], 
"output_parallel_121": ["s72", 3072], "q_30": ["s72", 2048], "k_30": ["s72", 512], "v_30": ["s72", 512], "positions_30": ["s72"], "cos_sin_30": ["s72", 128], "cos_90": ["s72", 64], "sin_90": ["s72", 64], "query_90": ["s72", 16, 128], "query_rot_30": ["s72", 16, 128], "query_pass_30": ["s72", 16, 0], "unsqueeze_121": ["s72", 1, 64], "cos_91": ["s72", 1, 64], "unsqueeze_122": ["s72", 1, 64], "sin_91": ["s72", 1, 64], "x1_60": ["s72", 16, 64], "x2_60": ["s72", 16, 64], "mul_395": ["s72", 16, 64], "mul_396": ["s72", 16, 64], "o1_60": ["s72", 16, 64], "mul_397": ["s72", 16, 64], "mul_398": ["s72", 16, 64], "o2_60": ["s72", 16, 64], "output_181": ["s72", 16, 128], "cat_121": ["s72", 16, 128], "query_91": ["s72", 2048], "key_90": ["s72", 4, 128], "key_rot_30": ["s72", 4, 128], "key_pass_30": ["s72", 4, 0], "unsqueeze_123": ["s72", 1, 64], "cos_92": ["s72", 1, 64], "unsqueeze_124": ["s72", 1, 64], "sin_92": ["s72", 1, 64], "x1_61": ["s72", 4, 64], "x2_61": ["s72", 4, 64], "mul_399": ["s72", 4, 64], "mul_400": ["s72", 4, 64], "o1_61": ["s72", 4, 64], "mul_401": ["s72", 4, 64], "mul_402": ["s72", 4, 64], "o2_61": ["s72", 4, 64], "output_182": ["s72", 4, 128], "cat_123": ["s72", 4, 128], "key_91": ["s72", 512], "output_183": ["s72", 2048], "query_92": ["s72", 16, 128], "output_184": ["s72", 16, 128], "key_92": ["s72", 4, 128], "value_30": ["s72", 4, 128], "attn_output_30": ["s72", 2048], "output_parallel_122": ["s72", 4096], "output_185": ["s72", 4096], "_get_data_attr_61": [4096], "x_334": ["s72", 4096], "x_335": ["s72", 4096], "residual_60": ["s72", 4096], "pow_62": ["s72", 4096], "variance_61": ["s72", 1], "add_185": ["s72", 1], "rsqrt_61": ["s72", 1], "x_336": ["s72", 4096], "x_337": ["s72", 4096], "x_338": ["s72", 4096], "output_parallel_123": ["s72", 14336], "getitem_866": ["s72", 7168], "silu_30": ["s72", 7168], "getitem_867": ["s72", 7168], "x_339": ["s72", 7168], "output_parallel_124": ["s72", 4096], "output_186": ["s72", 4096], "_get_data_attr_62": [4096], "x_340": ["s72", 4096], "x_341": ["s72", 4096], "residual_61": ["s72", 4096], "pow_63": ["s72", 4096], "variance_62": ["s72", 1], "add_187": ["s72", 1], "rsqrt_62": ["s72", 1], "x_342": ["s72", 4096], "x_343": ["s72", 4096], "x_344": ["s72", 4096], "output_parallel_125": ["s72", 3072], "q_31": ["s72", 2048], "k_31": ["s72", 512], "v_31": ["s72", 512], "positions_31": ["s72"], "cos_sin_31": ["s72", 128], "cos_93": ["s72", 64], "sin_93": ["s72", 64], "query_93": ["s72", 16, 128], "query_rot_31": ["s72", 16, 128], "query_pass_31": ["s72", 16, 0], "unsqueeze_125": ["s72", 1, 64], "cos_94": ["s72", 1, 64], "unsqueeze_126": ["s72", 1, 64], "sin_94": ["s72", 1, 64], "x1_62": ["s72", 16, 64], "x2_62": ["s72", 16, 64], "mul_408": ["s72", 16, 64], "mul_409": ["s72", 16, 64], "o1_62": ["s72", 16, 64], "mul_410": ["s72", 16, 64], "mul_411": ["s72", 16, 64], "o2_62": ["s72", 16, 64], "output_187": ["s72", 16, 128], "cat_125": ["s72", 16, 128], "query_94": ["s72", 2048], "key_93": ["s72", 4, 128], "key_rot_31": ["s72", 4, 128], "key_pass_31": ["s72", 4, 0], "unsqueeze_127": ["s72", 1, 64], "cos_95": ["s72", 1, 64], "unsqueeze_128": ["s72", 1, 64], "sin_95": ["s72", 1, 64], "x1_63": ["s72", 4, 64], "x2_63": ["s72", 4, 64], "mul_412": ["s72", 4, 64], "mul_413": ["s72", 4, 64], "o1_63": ["s72", 4, 64], "mul_414": ["s72", 4, 64], "mul_415": ["s72", 4, 64], "o2_63": ["s72", 4, 64], "output_188": ["s72", 4, 128], "cat_127": ["s72", 4, 128], "key_94": ["s72", 512], "output_189": ["s72", 2048], "query_95": ["s72", 16, 128], "output_190": ["s72", 16, 128], 
"key_95": ["s72", 4, 128], "value_31": ["s72", 4, 128], "attn_output_31": ["s72", 2048], "output_parallel_126": ["s72", 4096], "output_191": ["s72", 4096], "_get_data_attr_63": [4096], "x_345": ["s72", 4096], "x_346": ["s72", 4096], "residual_62": ["s72", 4096], "pow_64": ["s72", 4096], "variance_63": ["s72", 1], "add_191": ["s72", 1], "rsqrt_63": ["s72", 1], "x_347": ["s72", 4096], "x_348": ["s72", 4096], "x_349": ["s72", 4096], "output_parallel_127": ["s72", 14336], "getitem_894": ["s72", 7168], "silu_31": ["s72", 7168], "getitem_895": ["s72", 7168], "x_350": ["s72", 7168], "output_parallel_128": ["s72", 4096], "output_192": ["s72", 4096], "_get_data_attr_64": [4096], "x_351": ["s72", 4096], "x_352": ["s72", 4096], "residual_63": ["s72", 4096], "pow_65": ["s72", 4096], "variance_64": ["s72", 1], "add_193": ["s72", 1], "rsqrt_64": ["s72", 1], "x_353": ["s72", 4096], "x_354": ["s72", 4096], "x_355": ["s72", 4096]}}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "8bea56466f993132fd99de55fe333072"} + class GraphModule(torch.nn.Module): + def forward(self, s72: "Sym(s72)", L_input_ids_: "i32[s72][1]cuda:0", L_self_modules_embed_tokens_parameters_weight_: "bf16[64128, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128][128, 1]cuda:0", s80: "Sym(s72)", L_positions_: "i64[s72][1]cuda:0", L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: 
"bf16[4096][1]cuda:0", L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_: 
"bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", 
L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: 
"bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", 
L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: 
"bf16[4096][1]cuda:0", L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048][2048, 1]cuda:0", L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: "bf16[4096][1]cuda:0", L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096][4096, 1]cuda:0", L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168][7168, 1]cuda:0", L_self_modules_norm_parameters_weight_: "bf16[4096][1]cuda:0"): + l_input_ids_ = L_input_ids_ + l_self_modules_embed_tokens_parameters_weight_ = L_self_modules_embed_tokens_parameters_weight_ + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = L_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ + l_positions_ = L_positions_ + l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = 
L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = 
L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = 
L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = 
L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + 
l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = 
L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ + 
l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = 
L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ + 
l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = 
L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_norm_parameters_weight_ = L_self_modules_norm_parameters_weight_ + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:167 in get_masked_input_and_mask, code: org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + ge: "b8[s72][1]cuda:0" = l_input_ids_ >= 0 + lt: "b8[s72][1]cuda:0" = l_input_ids_ < 64128 + org_vocab_mask: "b8[s72][1]cuda:0" = ge & lt; ge = lt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:168 in get_masked_input_and_mask, code: added_vocab_mask = (input_ >= added_vocab_start_index) & ( + ge_1: "b8[s72][1]cuda:0" = l_input_ids_ >= 128256 + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:169 in get_masked_input_and_mask, code: input_ < added_vocab_end_index + lt_1: "b8[s72][1]cuda:0" = l_input_ids_ < 128256 + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:168 in get_masked_input_and_mask, code: added_vocab_mask = (input_ >= added_vocab_start_index) & ( + added_vocab_mask: "b8[s72][1]cuda:0" = ge_1 & lt_1; ge_1 = lt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:176 in get_masked_input_and_mask, code: valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + mul: "i64[s72][1]cuda:0" = 0 * org_vocab_mask + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:177 in get_masked_input_and_mask, code: added_offset * added_vocab_mask + mul_1: "i64[s72][1]cuda:0" = 64128 * added_vocab_mask + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:176 in get_masked_input_and_mask, code: valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + valid_offset: "i64[s72][1]cuda:0" = mul + mul_1; mul = mul_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:179 in get_masked_input_and_mask, code: vocab_mask = org_vocab_mask | added_vocab_mask + vocab_mask: "b8[s72][1]cuda:0" = org_vocab_mask | added_vocab_mask; org_vocab_mask = added_vocab_mask = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:180 in get_masked_input_and_mask, code: input_ = vocab_mask * (input_ - valid_offset) + sub: "i64[s72][1]cuda:0" = l_input_ids_ - valid_offset; l_input_ids_ = valid_offset = None + input_: "i64[s72][1]cuda:0" = vocab_mask * sub; sub = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:181 in get_masked_input_and_mask, code: return input_, ~vocab_mask + input_mask: "b8[s72][1]cuda:0" = ~vocab_mask; vocab_mask = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:475 in forward_native, code: output_parallel = self.quant_method.embedding(self, masked_input.long()) + long: "i64[s72][1]cuda:0" = input_.long(); input_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:72 in embedding, code: return F.embedding(input_, layer.weight) + output_parallel: "bf16[s72, 4096][4096, 1]cuda:0" = torch.nn.functional.embedding(long, 
l_self_modules_embed_tokens_parameters_weight_); long = l_self_modules_embed_tokens_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:478 in forward_native, code: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) + unsqueeze: "b8[s72, 1][1, 1]cuda:0" = input_mask.unsqueeze(-1); input_mask = None + masked_fill_: "bf16[s72, 4096][4096, 1]cuda:0" = output_parallel.masked_fill_(unsqueeze, 0); unsqueeze = masked_fill_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel, group_name = 'tp:0'); output_parallel = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x: "f32[s72, 4096][4096, 1]cuda:0" = output.to(torch.float32) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096][4096, 1]cuda:0" = x.pow(2) + variance: "f32[s72, 1][1, 1]cuda:0" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1][1, 1]cuda:0" = variance + 1e-05; variance = None + rsqrt: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_1); add_1 = None + x_1: "f32[s72, 4096][4096, 1]cuda:0" = x * rsqrt; x = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_2: "bf16[s72, 4096][4096, 1]cuda:0" = x_1.to(torch.bfloat16); x_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_3: "bf16[s72, 4096][4096, 1]cuda:0" = x_2 * _get_data_attr; x_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_1: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_3, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_3 = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = output_parallel_1.split([2048, 512, 512], dim = -1); output_parallel_1 = None + q: "bf16[s72, 2048][3072, 1]cuda:0" = split[0] + k: "bf16[s72, 512][3072, 1]cuda:0" = split[1] + v: "bf16[s72, 512][3072, 1]cuda:0" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions); positions = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = cos_sin.chunk(2, dim = -1); cos_sin = None + cos: "bf16[s72, 64][128, 1]cuda:0" = chunk[0] + sin: "bf16[s72, 64][128, 1]cuda:0" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_2 = q.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q.view(s72, -1, 128); q = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query[(Ellipsis, slice(128, None, None))]; query = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos.unsqueeze(-2) + cos_1: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin.unsqueeze(-2) + sin_1: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(query_rot, 2, dim = -1); query_rot = None + x1: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_1[0] + x2: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1 * cos_1 + mul_6: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2 * sin_1 + o1: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2 * cos_1; x2 = cos_1 = None + mul_8: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1 * sin_1; x1 = sin_1 = None + o2: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_1: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = 
torch.cat((o1, o2), dim = -1); o1 = o2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_1, query_pass), dim = -1); output_1 = query_pass = None + query_1: "bf16[s72, 2048][2048, 1]cuda:0" = cat_1.reshape(size_2); cat_1 = size_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_3 = k.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k.view(s72, -1, 128); k = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key[(Ellipsis, slice(128, None, None))]; key = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos.unsqueeze(-2); cos = None + cos_2: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_4: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin.unsqueeze(-2); sin = None + sin_2: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_4.to(torch.bfloat16); unsqueeze_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(key_rot, 2, dim = -1); key_rot = None + x1_1: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_2[0] + x2_1: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_1 * cos_2 + mul_10: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_1 * sin_2 + o1_1: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_1 * cos_2; x2_1 = cos_2 = None + mul_12: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_1 * sin_2; x1_1 = sin_2 = None + o2_1: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_2: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_1, o2_1), dim = -1); o1_1 = o2_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, 
key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_2, key_pass), dim = -1); output_2 = key_pass = None + key_1: "bf16[s72, 512][512, 1]cuda:0" = cat_3.reshape(size_3); cat_3 = size_3 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_4 = query_1.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_3: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_4, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_4 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_2: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_1.view(-1, 16, 128); query_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_4: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_3.view(-1, 16, 128); output_3 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_2: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_1.view(-1, 4, 128); key_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v.view(-1, 4, 128); v = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_2, key_2, value, output_4, 'model.layers.0.self_attn.attn'); query_2 = key_2 = value = unified_attention_with_output = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output: "bf16[s72, 2048][2048, 1]cuda:0" = output_4.view(-1, 2048); output_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_2: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output, l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_5: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_2, group_name = 'tp:0'); output_parallel_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_4: "f32[s72, 4096][4096, 1]cuda:0" = 
output_5.to(torch.float32); output_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_5: "f32[s72, 4096][4096, 1]cuda:0" = x_4 + output; x_4 = output = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual: "bf16[s72, 4096][4096, 1]cuda:0" = x_5.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096][4096, 1]cuda:0" = x_5.pow(2) + variance_1: "f32[s72, 1][1, 1]cuda:0" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_5: "f32[s72, 1][1, 1]cuda:0" = variance_1 + 1e-05; variance_1 = None + rsqrt_1: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_5); add_5 = None + x_6: "f32[s72, 4096][4096, 1]cuda:0" = x_5 * rsqrt_1; x_5 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_7: "bf16[s72, 4096][4096, 1]cuda:0" = x_6.to(torch.bfloat16); x_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_8: "bf16[s72, 4096][4096, 1]cuda:0" = x_7 * _get_data_attr_1; x_7 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_3: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_8, l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_8 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_26: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_3[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_26); getitem_26 = None + getitem_27: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_3[(Ellipsis, slice(7168, None, None))]; output_parallel_3 = None + x_9: "bf16[s72, 7168][7168, 1]cuda:0" = silu * getitem_27; silu = getitem_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_4: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_9, l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, None); x_9 = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_6: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_4, group_name = 'tp:0'); output_parallel_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_2: "bf16[4096][1]cuda:0" = 
torch._C._autograd._get_data_attr(l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_10: "f32[s72, 4096][4096, 1]cuda:0" = output_6.to(torch.float32); output_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_11: "f32[s72, 4096][4096, 1]cuda:0" = x_10 + residual; x_10 = residual = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_1: "bf16[s72, 4096][4096, 1]cuda:0" = x_11.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_3: "f32[s72, 4096][4096, 1]cuda:0" = x_11.pow(2) + variance_2: "f32[s72, 1][1, 1]cuda:0" = pow_3.mean(dim = -1, keepdim = True); pow_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_7: "f32[s72, 1][1, 1]cuda:0" = variance_2 + 1e-05; variance_2 = None + rsqrt_2: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_7); add_7 = None + x_12: "f32[s72, 4096][4096, 1]cuda:0" = x_11 * rsqrt_2; x_11 = rsqrt_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_13: "bf16[s72, 4096][4096, 1]cuda:0" = x_12.to(torch.bfloat16); x_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_14: "bf16[s72, 4096][4096, 1]cuda:0" = x_13 * _get_data_attr_2; x_13 = _get_data_attr_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_5: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_14, l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_14 = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_1 = output_parallel_5.split([2048, 512, 512], dim = -1); output_parallel_5 = None + q_1: "bf16[s72, 2048][3072, 1]cuda:0" = split_1[0] + k_1: "bf16[s72, 512][3072, 1]cuda:0" = split_1[1] + v_1: "bf16[s72, 512][3072, 1]cuda:0" = split_1[2]; split_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_1: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_1: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_1); positions_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_3 = 
cos_sin_1.chunk(2, dim = -1); cos_sin_1 = None + cos_3: "bf16[s72, 64][128, 1]cuda:0" = chunk_3[0] + sin_3: "bf16[s72, 64][128, 1]cuda:0" = chunk_3[1]; chunk_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_9 = q_1.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_3: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_1.view(s72, -1, 128); q_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_1: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_3[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_1: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_3[(Ellipsis, slice(128, None, None))]; query_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_5: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_3.unsqueeze(-2) + cos_4: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_5.to(torch.bfloat16); unsqueeze_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_6: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_3.unsqueeze(-2) + sin_4: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_6.to(torch.bfloat16); unsqueeze_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_4 = torch.chunk(query_rot_1, 2, dim = -1); query_rot_1 = None + x1_2: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_4[0] + x2_2: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_4[1]; chunk_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_18: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_2 * cos_4 + mul_19: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_2 * sin_4 + o1_2: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_18 - mul_19; mul_18 = mul_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_20: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_2 * cos_4; x2_2 = cos_4 = None + mul_21: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_2 * sin_4; x1_2 = sin_4 = None + o2_2: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_20 + mul_21; mul_20 = mul_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_7: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_2, o2_2), dim = -1); o1_2 = o2_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_5: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_7, query_pass_1), dim = -1); output_7 = query_pass_1 = None + query_4: "bf16[s72, 2048][2048, 1]cuda:0" = 
cat_5.reshape(size_9); cat_5 = size_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_10 = k_1.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_3: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_1.view(s72, -1, 128); k_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_1: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_3[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_1: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_3[(Ellipsis, slice(128, None, None))]; key_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_7: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_3.unsqueeze(-2); cos_3 = None + cos_5: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_7.to(torch.bfloat16); unsqueeze_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_8: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_3.unsqueeze(-2); sin_3 = None + sin_5: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_8.to(torch.bfloat16); unsqueeze_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_5 = torch.chunk(key_rot_1, 2, dim = -1); key_rot_1 = None + x1_3: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_5[0] + x2_3: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_5[1]; chunk_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_22: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_3 * cos_5 + mul_23: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_3 * sin_5 + o1_3: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_22 - mul_23; mul_22 = mul_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_24: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_3 * cos_5; x2_3 = cos_5 = None + mul_25: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_3 * sin_5; x1_3 = sin_5 = None + o2_3: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_24 + mul_25; mul_24 = mul_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_8: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_3, o2_3), dim = -1); o1_3 = o2_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_7: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_8, key_pass_1), dim = -1); output_8 = key_pass_1 = None + key_4: "bf16[s72, 512][512, 1]cuda:0" = cat_7.reshape(size_10); cat_7 = size_10 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if 
output_shape is not None else query.shape + size_11 = query_4.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_9: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_11, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_11 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_5: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_4.view(-1, 16, 128); query_4 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_10: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_9.view(-1, 16, 128); output_9 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_5: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_4.view(-1, 4, 128); key_4 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_1: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_1.view(-1, 4, 128); v_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_1 = torch.ops.vllm.unified_attention_with_output(query_5, key_5, value_1, output_10, 'model.layers.1.self_attn.attn'); query_5 = key_5 = value_1 = unified_attention_with_output_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_1: "bf16[s72, 2048][2048, 1]cuda:0" = output_10.view(-1, 2048); output_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_6: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_1, l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_1 = l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_11: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_6, group_name = 'tp:0'); output_parallel_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_3: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_15: "f32[s72, 4096][4096, 1]cuda:0" = output_11.to(torch.float32); output_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_16: "f32[s72, 4096][4096, 1]cuda:0" = x_15 + residual_1; x_15 = residual_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in 
forward_static, code: residual = x.to(orig_dtype) + residual_2: "bf16[s72, 4096][4096, 1]cuda:0" = x_16.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_4: "f32[s72, 4096][4096, 1]cuda:0" = x_16.pow(2) + variance_3: "f32[s72, 1][1, 1]cuda:0" = pow_4.mean(dim = -1, keepdim = True); pow_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_11: "f32[s72, 1][1, 1]cuda:0" = variance_3 + 1e-05; variance_3 = None + rsqrt_3: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_11); add_11 = None + x_17: "f32[s72, 4096][4096, 1]cuda:0" = x_16 * rsqrt_3; x_16 = rsqrt_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_18: "bf16[s72, 4096][4096, 1]cuda:0" = x_17.to(torch.bfloat16); x_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_19: "bf16[s72, 4096][4096, 1]cuda:0" = x_18 * _get_data_attr_3; x_18 = _get_data_attr_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_7: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_19, l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_19 = l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_54: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_7[(Ellipsis, slice(None, 7168, None))] + silu_1: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_54); getitem_54 = None + getitem_55: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_7[(Ellipsis, slice(7168, None, None))]; output_parallel_7 = None + x_20: "bf16[s72, 7168][7168, 1]cuda:0" = silu_1 * getitem_55; silu_1 = getitem_55 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_8: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_20, l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, None); x_20 = l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_12: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_8, group_name = 'tp:0'); output_parallel_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_4: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_21: "f32[s72, 4096][4096, 
1]cuda:0" = output_12.to(torch.float32); output_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_22: "f32[s72, 4096][4096, 1]cuda:0" = x_21 + residual_2; x_21 = residual_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_3: "bf16[s72, 4096][4096, 1]cuda:0" = x_22.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_5: "f32[s72, 4096][4096, 1]cuda:0" = x_22.pow(2) + variance_4: "f32[s72, 1][1, 1]cuda:0" = pow_5.mean(dim = -1, keepdim = True); pow_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_13: "f32[s72, 1][1, 1]cuda:0" = variance_4 + 1e-05; variance_4 = None + rsqrt_4: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_13); add_13 = None + x_23: "f32[s72, 4096][4096, 1]cuda:0" = x_22 * rsqrt_4; x_22 = rsqrt_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_24: "bf16[s72, 4096][4096, 1]cuda:0" = x_23.to(torch.bfloat16); x_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_25: "bf16[s72, 4096][4096, 1]cuda:0" = x_24 * _get_data_attr_4; x_24 = _get_data_attr_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_9: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_25, l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_25 = l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_2 = output_parallel_9.split([2048, 512, 512], dim = -1); output_parallel_9 = None + q_2: "bf16[s72, 2048][3072, 1]cuda:0" = split_2[0] + k_2: "bf16[s72, 512][3072, 1]cuda:0" = split_2[1] + v_2: "bf16[s72, 512][3072, 1]cuda:0" = split_2[2]; split_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_2: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_2: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_2); positions_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_6 = cos_sin_2.chunk(2, dim = -1); cos_sin_2 = None + cos_6: "bf16[s72, 64][128, 1]cuda:0" = chunk_6[0] + sin_6: "bf16[s72, 64][128, 1]cuda:0" = chunk_6[1]; chunk_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_16 = q_2.size() + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_6: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_2.view(s72, -1, 128); q_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_2: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_6[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_2: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_6[(Ellipsis, slice(128, None, None))]; query_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_9: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_6.unsqueeze(-2) + cos_7: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_9.to(torch.bfloat16); unsqueeze_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_10: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_6.unsqueeze(-2) + sin_7: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_10.to(torch.bfloat16); unsqueeze_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_7 = torch.chunk(query_rot_2, 2, dim = -1); query_rot_2 = None + x1_4: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_7[0] + x2_4: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_7[1]; chunk_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_31: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_4 * cos_7 + mul_32: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_4 * sin_7 + o1_4: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_31 - mul_32; mul_31 = mul_32 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_33: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_4 * cos_7; x2_4 = cos_7 = None + mul_34: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_4 * sin_7; x1_4 = sin_7 = None + o2_4: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_33 + mul_34; mul_33 = mul_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_13: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_4, o2_4), dim = -1); o1_4 = o2_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_9: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_13, query_pass_2), dim = -1); output_13 = query_pass_2 = None + query_7: "bf16[s72, 2048][2048, 1]cuda:0" = cat_9.reshape(size_16); cat_9 = size_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_17 = k_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = 
key.view(num_tokens, -1, head_size) + key_6: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_2.view(s72, -1, 128); k_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_2: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_6[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_2: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_6[(Ellipsis, slice(128, None, None))]; key_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_11: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_6.unsqueeze(-2); cos_6 = None + cos_8: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_11.to(torch.bfloat16); unsqueeze_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_12: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_6.unsqueeze(-2); sin_6 = None + sin_8: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_12.to(torch.bfloat16); unsqueeze_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_8 = torch.chunk(key_rot_2, 2, dim = -1); key_rot_2 = None + x1_5: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_8[0] + x2_5: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_8[1]; chunk_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_35: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_5 * cos_8 + mul_36: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_5 * sin_8 + o1_5: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_35 - mul_36; mul_35 = mul_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_37: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_5 * cos_8; x2_5 = cos_8 = None + mul_38: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_5 * sin_8; x1_5 = sin_8 = None + o2_5: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_37 + mul_38; mul_37 = mul_38 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_14: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_5, o2_5), dim = -1); o1_5 = o2_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_11: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_14, key_pass_2), dim = -1); output_14 = key_pass_2 = None + key_7: "bf16[s72, 512][512, 1]cuda:0" = cat_11.reshape(size_17); cat_11 = size_17 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_18 = query_7.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_15: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_18, dtype = torch.bfloat16, device = 
device(type='cuda', index=0)); size_18 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_8: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_7.view(-1, 16, 128); query_7 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_16: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_15.view(-1, 16, 128); output_15 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_8: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_7.view(-1, 4, 128); key_7 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_2: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_2.view(-1, 4, 128); v_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_2 = torch.ops.vllm.unified_attention_with_output(query_8, key_8, value_2, output_16, 'model.layers.2.self_attn.attn'); query_8 = key_8 = value_2 = unified_attention_with_output_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_2: "bf16[s72, 2048][2048, 1]cuda:0" = output_16.view(-1, 2048); output_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_10: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_2, l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_2 = l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_17: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_10, group_name = 'tp:0'); output_parallel_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_5: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_26: "f32[s72, 4096][4096, 1]cuda:0" = output_17.to(torch.float32); output_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_27: "f32[s72, 4096][4096, 1]cuda:0" = x_26 + residual_3; x_26 = residual_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_4: "bf16[s72, 4096][4096, 1]cuda:0" = x_27.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_6: "f32[s72, 4096][4096, 1]cuda:0" = 
x_27.pow(2) + variance_5: "f32[s72, 1][1, 1]cuda:0" = pow_6.mean(dim = -1, keepdim = True); pow_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_17: "f32[s72, 1][1, 1]cuda:0" = variance_5 + 1e-05; variance_5 = None + rsqrt_5: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_17); add_17 = None + x_28: "f32[s72, 4096][4096, 1]cuda:0" = x_27 * rsqrt_5; x_27 = rsqrt_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_29: "bf16[s72, 4096][4096, 1]cuda:0" = x_28.to(torch.bfloat16); x_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_30: "bf16[s72, 4096][4096, 1]cuda:0" = x_29 * _get_data_attr_5; x_29 = _get_data_attr_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_11: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_30, l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_30 = l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_82: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_11[(Ellipsis, slice(None, 7168, None))] + silu_2: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_82); getitem_82 = None + getitem_83: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_11[(Ellipsis, slice(7168, None, None))]; output_parallel_11 = None + x_31: "bf16[s72, 7168][7168, 1]cuda:0" = silu_2 * getitem_83; silu_2 = getitem_83 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_12: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_31, l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, None); x_31 = l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_18: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_12, group_name = 'tp:0'); output_parallel_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_6: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_32: "f32[s72, 4096][4096, 1]cuda:0" = output_18.to(torch.float32); output_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_33: "f32[s72, 4096][4096, 1]cuda:0" = x_32 + residual_4; x_32 = residual_4 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_5: "bf16[s72, 4096][4096, 1]cuda:0" = x_33.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_7: "f32[s72, 4096][4096, 1]cuda:0" = x_33.pow(2) + variance_6: "f32[s72, 1][1, 1]cuda:0" = pow_7.mean(dim = -1, keepdim = True); pow_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_19: "f32[s72, 1][1, 1]cuda:0" = variance_6 + 1e-05; variance_6 = None + rsqrt_6: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_19); add_19 = None + x_34: "f32[s72, 4096][4096, 1]cuda:0" = x_33 * rsqrt_6; x_33 = rsqrt_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_35: "bf16[s72, 4096][4096, 1]cuda:0" = x_34.to(torch.bfloat16); x_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_36: "bf16[s72, 4096][4096, 1]cuda:0" = x_35 * _get_data_attr_6; x_35 = _get_data_attr_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_13: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_36, l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_36 = l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_3 = output_parallel_13.split([2048, 512, 512], dim = -1); output_parallel_13 = None + q_3: "bf16[s72, 2048][3072, 1]cuda:0" = split_3[0] + k_3: "bf16[s72, 512][3072, 1]cuda:0" = split_3[1] + v_3: "bf16[s72, 512][3072, 1]cuda:0" = split_3[2]; split_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_3: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_3: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_3); positions_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_9 = cos_sin_3.chunk(2, dim = -1); cos_sin_3 = None + cos_9: "bf16[s72, 64][128, 1]cuda:0" = chunk_9[0] + sin_9: "bf16[s72, 64][128, 1]cuda:0" = chunk_9[1]; chunk_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_23 = q_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_9: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_3.view(s72, -1, 128); q_3 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_3: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_9[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_3: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_9[(Ellipsis, slice(128, None, None))]; query_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_13: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_9.unsqueeze(-2) + cos_10: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_13.to(torch.bfloat16); unsqueeze_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_14: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_9.unsqueeze(-2) + sin_10: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_14.to(torch.bfloat16); unsqueeze_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_10 = torch.chunk(query_rot_3, 2, dim = -1); query_rot_3 = None + x1_6: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_10[0] + x2_6: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_10[1]; chunk_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_44: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_6 * cos_10 + mul_45: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_6 * sin_10 + o1_6: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_44 - mul_45; mul_44 = mul_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_46: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_6 * cos_10; x2_6 = cos_10 = None + mul_47: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_6 * sin_10; x1_6 = sin_10 = None + o2_6: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_46 + mul_47; mul_46 = mul_47 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_19: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_6, o2_6), dim = -1); o1_6 = o2_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_13: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_19, query_pass_3), dim = -1); output_19 = query_pass_3 = None + query_10: "bf16[s72, 2048][2048, 1]cuda:0" = cat_13.reshape(size_23); cat_13 = size_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_24 = k_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_9: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_3.view(s72, -1, 128); k_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot 
= key[..., :rotary_dim] + key_rot_3: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_9[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_3: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_9[(Ellipsis, slice(128, None, None))]; key_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_15: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_9.unsqueeze(-2); cos_9 = None + cos_11: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_15.to(torch.bfloat16); unsqueeze_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_16: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_9.unsqueeze(-2); sin_9 = None + sin_11: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_16.to(torch.bfloat16); unsqueeze_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_11 = torch.chunk(key_rot_3, 2, dim = -1); key_rot_3 = None + x1_7: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_11[0] + x2_7: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_11[1]; chunk_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_48: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_7 * cos_11 + mul_49: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_7 * sin_11 + o1_7: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_48 - mul_49; mul_48 = mul_49 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_50: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_7 * cos_11; x2_7 = cos_11 = None + mul_51: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_7 * sin_11; x1_7 = sin_11 = None + o2_7: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_50 + mul_51; mul_50 = mul_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_20: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_7, o2_7), dim = -1); o1_7 = o2_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_15: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_20, key_pass_3), dim = -1); output_20 = key_pass_3 = None + key_10: "bf16[s72, 512][512, 1]cuda:0" = cat_15.reshape(size_24); cat_15 = size_24 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_25 = query_10.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_21: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_25, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_25 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_11: "bf16[s72, 16, 128][2048, 128, 
1]cuda:0" = query_10.view(-1, 16, 128); query_10 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_22: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_21.view(-1, 16, 128); output_21 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_11: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_10.view(-1, 4, 128); key_10 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_3: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_3.view(-1, 4, 128); v_3 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_3 = torch.ops.vllm.unified_attention_with_output(query_11, key_11, value_3, output_22, 'model.layers.3.self_attn.attn'); query_11 = key_11 = value_3 = unified_attention_with_output_3 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_3: "bf16[s72, 2048][2048, 1]cuda:0" = output_22.view(-1, 2048); output_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_14: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_3, l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_3 = l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_23: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_14, group_name = 'tp:0'); output_parallel_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_7: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_37: "f32[s72, 4096][4096, 1]cuda:0" = output_23.to(torch.float32); output_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_38: "f32[s72, 4096][4096, 1]cuda:0" = x_37 + residual_5; x_37 = residual_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_6: "bf16[s72, 4096][4096, 1]cuda:0" = x_38.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_8: "f32[s72, 4096][4096, 1]cuda:0" = x_38.pow(2) + variance_7: "f32[s72, 1][1, 1]cuda:0" = pow_8.mean(dim = -1, keepdim = True); pow_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * 
torch.rsqrt(variance + variance_epsilon) + add_23: "f32[s72, 1][1, 1]cuda:0" = variance_7 + 1e-05; variance_7 = None + rsqrt_7: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_23); add_23 = None + x_39: "f32[s72, 4096][4096, 1]cuda:0" = x_38 * rsqrt_7; x_38 = rsqrt_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_40: "bf16[s72, 4096][4096, 1]cuda:0" = x_39.to(torch.bfloat16); x_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_41: "bf16[s72, 4096][4096, 1]cuda:0" = x_40 * _get_data_attr_7; x_40 = _get_data_attr_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_15: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_41, l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_41 = l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_110: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_15[(Ellipsis, slice(None, 7168, None))] + silu_3: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_110); getitem_110 = None + getitem_111: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_15[(Ellipsis, slice(7168, None, None))]; output_parallel_15 = None + x_42: "bf16[s72, 7168][7168, 1]cuda:0" = silu_3 * getitem_111; silu_3 = getitem_111 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_16: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_42, l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, None); x_42 = l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_24: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_16, group_name = 'tp:0'); output_parallel_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_8: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_43: "f32[s72, 4096][4096, 1]cuda:0" = output_24.to(torch.float32); output_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_44: "f32[s72, 4096][4096, 1]cuda:0" = x_43 + residual_6; x_43 = residual_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_7: "bf16[s72, 4096][4096, 1]cuda:0" = x_44.to(torch.bfloat16) + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_9: "f32[s72, 4096][4096, 1]cuda:0" = x_44.pow(2) + variance_8: "f32[s72, 1][1, 1]cuda:0" = pow_9.mean(dim = -1, keepdim = True); pow_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_25: "f32[s72, 1][1, 1]cuda:0" = variance_8 + 1e-05; variance_8 = None + rsqrt_8: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_25); add_25 = None + x_45: "f32[s72, 4096][4096, 1]cuda:0" = x_44 * rsqrt_8; x_44 = rsqrt_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_46: "bf16[s72, 4096][4096, 1]cuda:0" = x_45.to(torch.bfloat16); x_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_47: "bf16[s72, 4096][4096, 1]cuda:0" = x_46 * _get_data_attr_8; x_46 = _get_data_attr_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_17: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_47, l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_47 = l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_4 = output_parallel_17.split([2048, 512, 512], dim = -1); output_parallel_17 = None + q_4: "bf16[s72, 2048][3072, 1]cuda:0" = split_4[0] + k_4: "bf16[s72, 512][3072, 1]cuda:0" = split_4[1] + v_4: "bf16[s72, 512][3072, 1]cuda:0" = split_4[2]; split_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_4: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_4: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_4); positions_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_12 = cos_sin_4.chunk(2, dim = -1); cos_sin_4 = None + cos_12: "bf16[s72, 64][128, 1]cuda:0" = chunk_12[0] + sin_12: "bf16[s72, 64][128, 1]cuda:0" = chunk_12[1]; chunk_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_30 = q_4.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_12: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_4.view(s72, -1, 128); q_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_4: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_12[(Ellipsis, 
slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_4: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_12[(Ellipsis, slice(128, None, None))]; query_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_17: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_12.unsqueeze(-2) + cos_13: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_17.to(torch.bfloat16); unsqueeze_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_18: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_12.unsqueeze(-2) + sin_13: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_18.to(torch.bfloat16); unsqueeze_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_13 = torch.chunk(query_rot_4, 2, dim = -1); query_rot_4 = None + x1_8: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_13[0] + x2_8: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_13[1]; chunk_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_57: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_8 * cos_13 + mul_58: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_8 * sin_13 + o1_8: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_57 - mul_58; mul_57 = mul_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_59: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_8 * cos_13; x2_8 = cos_13 = None + mul_60: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_8 * sin_13; x1_8 = sin_13 = None + o2_8: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_59 + mul_60; mul_59 = mul_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_25: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_8, o2_8), dim = -1); o1_8 = o2_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_17: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_25, query_pass_4), dim = -1); output_25 = query_pass_4 = None + query_13: "bf16[s72, 2048][2048, 1]cuda:0" = cat_17.reshape(size_30); cat_17 = size_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_31 = k_4.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_12: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_4.view(s72, -1, 128); k_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_4: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_12[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 
in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_4: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_12[(Ellipsis, slice(128, None, None))]; key_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_19: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_12.unsqueeze(-2); cos_12 = None + cos_14: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_19.to(torch.bfloat16); unsqueeze_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_20: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_12.unsqueeze(-2); sin_12 = None + sin_14: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_20.to(torch.bfloat16); unsqueeze_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_14 = torch.chunk(key_rot_4, 2, dim = -1); key_rot_4 = None + x1_9: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_14[0] + x2_9: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_14[1]; chunk_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_61: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_9 * cos_14 + mul_62: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_9 * sin_14 + o1_9: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_61 - mul_62; mul_61 = mul_62 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_63: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_9 * cos_14; x2_9 = cos_14 = None + mul_64: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_9 * sin_14; x1_9 = sin_14 = None + o2_9: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_63 + mul_64; mul_63 = mul_64 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_26: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_9, o2_9), dim = -1); o1_9 = o2_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_19: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_26, key_pass_4), dim = -1); output_26 = key_pass_4 = None + key_13: "bf16[s72, 512][512, 1]cuda:0" = cat_19.reshape(size_31); cat_19 = size_31 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_32 = query_13.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_27: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_32, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_32 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_14: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_13.view(-1, 16, 128); query_13 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_28: 
"bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_27.view(-1, 16, 128); output_27 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_14: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_13.view(-1, 4, 128); key_13 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_4: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_4.view(-1, 4, 128); v_4 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_4 = torch.ops.vllm.unified_attention_with_output(query_14, key_14, value_4, output_28, 'model.layers.4.self_attn.attn'); query_14 = key_14 = value_4 = unified_attention_with_output_4 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_4: "bf16[s72, 2048][2048, 1]cuda:0" = output_28.view(-1, 2048); output_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_18: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_4, l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_4 = l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_29: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_18, group_name = 'tp:0'); output_parallel_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_9: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_48: "f32[s72, 4096][4096, 1]cuda:0" = output_29.to(torch.float32); output_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_49: "f32[s72, 4096][4096, 1]cuda:0" = x_48 + residual_7; x_48 = residual_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_8: "bf16[s72, 4096][4096, 1]cuda:0" = x_49.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_10: "f32[s72, 4096][4096, 1]cuda:0" = x_49.pow(2) + variance_9: "f32[s72, 1][1, 1]cuda:0" = pow_10.mean(dim = -1, keepdim = True); pow_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_29: "f32[s72, 1][1, 1]cuda:0" = variance_9 + 1e-05; variance_9 = None + rsqrt_9: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_29); add_29 = None + x_50: 
"f32[s72, 4096][4096, 1]cuda:0" = x_49 * rsqrt_9; x_49 = rsqrt_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_51: "bf16[s72, 4096][4096, 1]cuda:0" = x_50.to(torch.bfloat16); x_50 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_52: "bf16[s72, 4096][4096, 1]cuda:0" = x_51 * _get_data_attr_9; x_51 = _get_data_attr_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_19: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_52, l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_52 = l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_138: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_19[(Ellipsis, slice(None, 7168, None))] + silu_4: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_138); getitem_138 = None + getitem_139: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_19[(Ellipsis, slice(7168, None, None))]; output_parallel_19 = None + x_53: "bf16[s72, 7168][7168, 1]cuda:0" = silu_4 * getitem_139; silu_4 = getitem_139 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_20: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_53, l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, None); x_53 = l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_30: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_20, group_name = 'tp:0'); output_parallel_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_10: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_54: "f32[s72, 4096][4096, 1]cuda:0" = output_30.to(torch.float32); output_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_55: "f32[s72, 4096][4096, 1]cuda:0" = x_54 + residual_8; x_54 = residual_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_9: "bf16[s72, 4096][4096, 1]cuda:0" = x_55.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_11: "f32[s72, 4096][4096, 1]cuda:0" = x_55.pow(2) + variance_10: "f32[s72, 1][1, 1]cuda:0" = 
pow_11.mean(dim = -1, keepdim = True); pow_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_31: "f32[s72, 1][1, 1]cuda:0" = variance_10 + 1e-05; variance_10 = None + rsqrt_10: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_31); add_31 = None + x_56: "f32[s72, 4096][4096, 1]cuda:0" = x_55 * rsqrt_10; x_55 = rsqrt_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_57: "bf16[s72, 4096][4096, 1]cuda:0" = x_56.to(torch.bfloat16); x_56 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_58: "bf16[s72, 4096][4096, 1]cuda:0" = x_57 * _get_data_attr_10; x_57 = _get_data_attr_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_21: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_58, l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_58 = l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_5 = output_parallel_21.split([2048, 512, 512], dim = -1); output_parallel_21 = None + q_5: "bf16[s72, 2048][3072, 1]cuda:0" = split_5[0] + k_5: "bf16[s72, 512][3072, 1]cuda:0" = split_5[1] + v_5: "bf16[s72, 512][3072, 1]cuda:0" = split_5[2]; split_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_5: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_5: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_5); positions_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_15 = cos_sin_5.chunk(2, dim = -1); cos_sin_5 = None + cos_15: "bf16[s72, 64][128, 1]cuda:0" = chunk_15[0] + sin_15: "bf16[s72, 64][128, 1]cuda:0" = chunk_15[1]; chunk_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_37 = q_5.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_15: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_5.view(s72, -1, 128); q_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_5: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_15[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_5: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = 
query_15[(Ellipsis, slice(128, None, None))]; query_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_21: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_15.unsqueeze(-2) + cos_16: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_21.to(torch.bfloat16); unsqueeze_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_22: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_15.unsqueeze(-2) + sin_16: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_22.to(torch.bfloat16); unsqueeze_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_16 = torch.chunk(query_rot_5, 2, dim = -1); query_rot_5 = None + x1_10: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_16[0] + x2_10: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_16[1]; chunk_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_70: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_10 * cos_16 + mul_71: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_10 * sin_16 + o1_10: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_70 - mul_71; mul_70 = mul_71 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_72: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_10 * cos_16; x2_10 = cos_16 = None + mul_73: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_10 * sin_16; x1_10 = sin_16 = None + o2_10: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_72 + mul_73; mul_72 = mul_73 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_31: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_10, o2_10), dim = -1); o1_10 = o2_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_21: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_31, query_pass_5), dim = -1); output_31 = query_pass_5 = None + query_16: "bf16[s72, 2048][2048, 1]cuda:0" = cat_21.reshape(size_37); cat_21 = size_37 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_38 = k_5.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_15: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_5.view(s72, -1, 128); k_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_5: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_15[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_5: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_15[(Ellipsis, slice(128, None, None))]; key_15 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_23: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_15.unsqueeze(-2); cos_15 = None + cos_17: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_23.to(torch.bfloat16); unsqueeze_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_24: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_15.unsqueeze(-2); sin_15 = None + sin_17: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_24.to(torch.bfloat16); unsqueeze_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_17 = torch.chunk(key_rot_5, 2, dim = -1); key_rot_5 = None + x1_11: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_17[0] + x2_11: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_17[1]; chunk_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_74: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_11 * cos_17 + mul_75: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_11 * sin_17 + o1_11: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_74 - mul_75; mul_74 = mul_75 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_76: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_11 * cos_17; x2_11 = cos_17 = None + mul_77: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_11 * sin_17; x1_11 = sin_17 = None + o2_11: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_76 + mul_77; mul_76 = mul_77 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_32: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_11, o2_11), dim = -1); o1_11 = o2_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_23: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_32, key_pass_5), dim = -1); output_32 = key_pass_5 = None + key_16: "bf16[s72, 512][512, 1]cuda:0" = cat_23.reshape(size_38); cat_23 = size_38 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_39 = query_16.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_33: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_39, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_39 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_17: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_16.view(-1, 16, 128); query_16 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_34: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_33.view(-1, 16, 128); output_33 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: 
key = key.view(-1, self.num_kv_heads, self.head_size) + key_17: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_16.view(-1, 4, 128); key_16 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_5: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_5.view(-1, 4, 128); v_5 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_5 = torch.ops.vllm.unified_attention_with_output(query_17, key_17, value_5, output_34, 'model.layers.5.self_attn.attn'); query_17 = key_17 = value_5 = unified_attention_with_output_5 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_5: "bf16[s72, 2048][2048, 1]cuda:0" = output_34.view(-1, 2048); output_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_22: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_5, l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_5 = l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_35: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_22, group_name = 'tp:0'); output_parallel_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_11: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_59: "f32[s72, 4096][4096, 1]cuda:0" = output_35.to(torch.float32); output_35 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_60: "f32[s72, 4096][4096, 1]cuda:0" = x_59 + residual_9; x_59 = residual_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_10: "bf16[s72, 4096][4096, 1]cuda:0" = x_60.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_12: "f32[s72, 4096][4096, 1]cuda:0" = x_60.pow(2) + variance_11: "f32[s72, 1][1, 1]cuda:0" = pow_12.mean(dim = -1, keepdim = True); pow_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_35: "f32[s72, 1][1, 1]cuda:0" = variance_11 + 1e-05; variance_11 = None + rsqrt_11: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_35); add_35 = None + x_61: "f32[s72, 4096][4096, 1]cuda:0" = x_60 * rsqrt_11; x_60 = rsqrt_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, 
code: x = x.to(orig_dtype) + x_62: "bf16[s72, 4096][4096, 1]cuda:0" = x_61.to(torch.bfloat16); x_61 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_63: "bf16[s72, 4096][4096, 1]cuda:0" = x_62 * _get_data_attr_11; x_62 = _get_data_attr_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_23: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_63, l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_63 = l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_166: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_23[(Ellipsis, slice(None, 7168, None))] + silu_5: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_166); getitem_166 = None + getitem_167: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_23[(Ellipsis, slice(7168, None, None))]; output_parallel_23 = None + x_64: "bf16[s72, 7168][7168, 1]cuda:0" = silu_5 * getitem_167; silu_5 = getitem_167 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_24: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_64, l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, None); x_64 = l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_36: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_24, group_name = 'tp:0'); output_parallel_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_12: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_65: "f32[s72, 4096][4096, 1]cuda:0" = output_36.to(torch.float32); output_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_66: "f32[s72, 4096][4096, 1]cuda:0" = x_65 + residual_10; x_65 = residual_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_11: "bf16[s72, 4096][4096, 1]cuda:0" = x_66.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_13: "f32[s72, 4096][4096, 1]cuda:0" = x_66.pow(2) + variance_12: "f32[s72, 1][1, 1]cuda:0" = pow_13.mean(dim = -1, keepdim = True); pow_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * 
torch.rsqrt(variance + variance_epsilon) + add_37: "f32[s72, 1][1, 1]cuda:0" = variance_12 + 1e-05; variance_12 = None + rsqrt_12: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_37); add_37 = None + x_67: "f32[s72, 4096][4096, 1]cuda:0" = x_66 * rsqrt_12; x_66 = rsqrt_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_68: "bf16[s72, 4096][4096, 1]cuda:0" = x_67.to(torch.bfloat16); x_67 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_69: "bf16[s72, 4096][4096, 1]cuda:0" = x_68 * _get_data_attr_12; x_68 = _get_data_attr_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_25: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_69, l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_69 = l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_6 = output_parallel_25.split([2048, 512, 512], dim = -1); output_parallel_25 = None + q_6: "bf16[s72, 2048][3072, 1]cuda:0" = split_6[0] + k_6: "bf16[s72, 512][3072, 1]cuda:0" = split_6[1] + v_6: "bf16[s72, 512][3072, 1]cuda:0" = split_6[2]; split_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_6: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_6: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_6); positions_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_18 = cos_sin_6.chunk(2, dim = -1); cos_sin_6 = None + cos_18: "bf16[s72, 64][128, 1]cuda:0" = chunk_18[0] + sin_18: "bf16[s72, 64][128, 1]cuda:0" = chunk_18[1]; chunk_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_44 = q_6.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_18: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_6.view(s72, -1, 128); q_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_6: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_18[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_6: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_18[(Ellipsis, slice(128, None, None))]; query_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in 
forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_25: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_18.unsqueeze(-2) + cos_19: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_25.to(torch.bfloat16); unsqueeze_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_26: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_18.unsqueeze(-2) + sin_19: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_26.to(torch.bfloat16); unsqueeze_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_19 = torch.chunk(query_rot_6, 2, dim = -1); query_rot_6 = None + x1_12: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_19[0] + x2_12: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_19[1]; chunk_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_83: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_12 * cos_19 + mul_84: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_12 * sin_19 + o1_12: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_83 - mul_84; mul_83 = mul_84 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_85: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_12 * cos_19; x2_12 = cos_19 = None + mul_86: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_12 * sin_19; x1_12 = sin_19 = None + o2_12: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_85 + mul_86; mul_85 = mul_86 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_37: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_12, o2_12), dim = -1); o1_12 = o2_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_25: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_37, query_pass_6), dim = -1); output_37 = query_pass_6 = None + query_19: "bf16[s72, 2048][2048, 1]cuda:0" = cat_25.reshape(size_44); cat_25 = size_44 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_45 = k_6.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_18: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_6.view(s72, -1, 128); k_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_6: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_18[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_6: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_18[(Ellipsis, slice(128, None, None))]; key_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_27: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = 
cos_18.unsqueeze(-2); cos_18 = None + cos_20: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_27.to(torch.bfloat16); unsqueeze_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_28: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_18.unsqueeze(-2); sin_18 = None + sin_20: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_28.to(torch.bfloat16); unsqueeze_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_20 = torch.chunk(key_rot_6, 2, dim = -1); key_rot_6 = None + x1_13: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_20[0] + x2_13: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_20[1]; chunk_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_87: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_13 * cos_20 + mul_88: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_13 * sin_20 + o1_13: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_87 - mul_88; mul_87 = mul_88 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_89: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_13 * cos_20; x2_13 = cos_20 = None + mul_90: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_13 * sin_20; x1_13 = sin_20 = None + o2_13: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_89 + mul_90; mul_89 = mul_90 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_38: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_13, o2_13), dim = -1); o1_13 = o2_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_27: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_38, key_pass_6), dim = -1); output_38 = key_pass_6 = None + key_19: "bf16[s72, 512][512, 1]cuda:0" = cat_27.reshape(size_45); cat_27 = size_45 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_46 = query_19.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_39: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_46, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_46 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_20: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_19.view(-1, 16, 128); query_19 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_40: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_39.view(-1, 16, 128); output_39 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_20: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_19.view(-1, 4, 128); key_19 = None + + # File: 
/data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_6: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_6.view(-1, 4, 128); v_6 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_6 = torch.ops.vllm.unified_attention_with_output(query_20, key_20, value_6, output_40, 'model.layers.6.self_attn.attn'); query_20 = key_20 = value_6 = unified_attention_with_output_6 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_6: "bf16[s72, 2048][2048, 1]cuda:0" = output_40.view(-1, 2048); output_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_26: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_6, l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_6 = l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_41: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_26, group_name = 'tp:0'); output_parallel_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_13: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_70: "f32[s72, 4096][4096, 1]cuda:0" = output_41.to(torch.float32); output_41 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_71: "f32[s72, 4096][4096, 1]cuda:0" = x_70 + residual_11; x_70 = residual_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_12: "bf16[s72, 4096][4096, 1]cuda:0" = x_71.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_14: "f32[s72, 4096][4096, 1]cuda:0" = x_71.pow(2) + variance_13: "f32[s72, 1][1, 1]cuda:0" = pow_14.mean(dim = -1, keepdim = True); pow_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_41: "f32[s72, 1][1, 1]cuda:0" = variance_13 + 1e-05; variance_13 = None + rsqrt_13: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_41); add_41 = None + x_72: "f32[s72, 4096][4096, 1]cuda:0" = x_71 * rsqrt_13; x_71 = rsqrt_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_73: "bf16[s72, 4096][4096, 1]cuda:0" = x_72.to(torch.bfloat16); x_72 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_74: "bf16[s72, 4096][4096, 1]cuda:0" = x_73 * _get_data_attr_13; x_73 = _get_data_attr_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_27: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_74, l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_74 = l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_194: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_27[(Ellipsis, slice(None, 7168, None))] + silu_6: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_194); getitem_194 = None + getitem_195: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_27[(Ellipsis, slice(7168, None, None))]; output_parallel_27 = None + x_75: "bf16[s72, 7168][7168, 1]cuda:0" = silu_6 * getitem_195; silu_6 = getitem_195 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_28: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_75, l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, None); x_75 = l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_42: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_28, group_name = 'tp:0'); output_parallel_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_14: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_76: "f32[s72, 4096][4096, 1]cuda:0" = output_42.to(torch.float32); output_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_77: "f32[s72, 4096][4096, 1]cuda:0" = x_76 + residual_12; x_76 = residual_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_13: "bf16[s72, 4096][4096, 1]cuda:0" = x_77.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_15: "f32[s72, 4096][4096, 1]cuda:0" = x_77.pow(2) + variance_14: "f32[s72, 1][1, 1]cuda:0" = pow_15.mean(dim = -1, keepdim = True); pow_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_43: "f32[s72, 1][1, 1]cuda:0" = variance_14 + 1e-05; variance_14 = None + 
rsqrt_14: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_43); add_43 = None + x_78: "f32[s72, 4096][4096, 1]cuda:0" = x_77 * rsqrt_14; x_77 = rsqrt_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_79: "bf16[s72, 4096][4096, 1]cuda:0" = x_78.to(torch.bfloat16); x_78 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_80: "bf16[s72, 4096][4096, 1]cuda:0" = x_79 * _get_data_attr_14; x_79 = _get_data_attr_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_29: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_80, l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_80 = l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_7 = output_parallel_29.split([2048, 512, 512], dim = -1); output_parallel_29 = None + q_7: "bf16[s72, 2048][3072, 1]cuda:0" = split_7[0] + k_7: "bf16[s72, 512][3072, 1]cuda:0" = split_7[1] + v_7: "bf16[s72, 512][3072, 1]cuda:0" = split_7[2]; split_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_7: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_7: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_7); positions_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_21 = cos_sin_7.chunk(2, dim = -1); cos_sin_7 = None + cos_21: "bf16[s72, 64][128, 1]cuda:0" = chunk_21[0] + sin_21: "bf16[s72, 64][128, 1]cuda:0" = chunk_21[1]; chunk_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_51 = q_7.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_21: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_7.view(s72, -1, 128); q_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_7: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_21[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_7: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_21[(Ellipsis, slice(128, None, None))]; query_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_29: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = 
cos_21.unsqueeze(-2) + cos_22: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_29.to(torch.bfloat16); unsqueeze_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_30: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_21.unsqueeze(-2) + sin_22: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_30.to(torch.bfloat16); unsqueeze_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_22 = torch.chunk(query_rot_7, 2, dim = -1); query_rot_7 = None + x1_14: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_22[0] + x2_14: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_22[1]; chunk_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_96: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_14 * cos_22 + mul_97: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_14 * sin_22 + o1_14: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_96 - mul_97; mul_96 = mul_97 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_98: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_14 * cos_22; x2_14 = cos_22 = None + mul_99: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_14 * sin_22; x1_14 = sin_22 = None + o2_14: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_98 + mul_99; mul_98 = mul_99 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_43: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_14, o2_14), dim = -1); o1_14 = o2_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_29: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_43, query_pass_7), dim = -1); output_43 = query_pass_7 = None + query_22: "bf16[s72, 2048][2048, 1]cuda:0" = cat_29.reshape(size_51); cat_29 = size_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_52 = k_7.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_21: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_7.view(s72, -1, 128); k_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_7: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_21[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_7: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_21[(Ellipsis, slice(128, None, None))]; key_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_31: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_21.unsqueeze(-2); cos_21 = None + cos_23: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = 
unsqueeze_31.to(torch.bfloat16); unsqueeze_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_32: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_21.unsqueeze(-2); sin_21 = None + sin_23: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_32.to(torch.bfloat16); unsqueeze_32 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_23 = torch.chunk(key_rot_7, 2, dim = -1); key_rot_7 = None + x1_15: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_23[0] + x2_15: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_23[1]; chunk_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_100: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_15 * cos_23 + mul_101: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_15 * sin_23 + o1_15: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_100 - mul_101; mul_100 = mul_101 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_102: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_15 * cos_23; x2_15 = cos_23 = None + mul_103: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_15 * sin_23; x1_15 = sin_23 = None + o2_15: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_102 + mul_103; mul_102 = mul_103 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_44: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_15, o2_15), dim = -1); o1_15 = o2_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_31: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_44, key_pass_7), dim = -1); output_44 = key_pass_7 = None + key_22: "bf16[s72, 512][512, 1]cuda:0" = cat_31.reshape(size_52); cat_31 = size_52 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_53 = query_22.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_45: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_53, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_53 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_23: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_22.view(-1, 16, 128); query_22 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_46: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_45.view(-1, 16, 128); output_45 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_23: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_22.view(-1, 4, 128); key_22 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, 
self.head_size) + value_7: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_7.view(-1, 4, 128); v_7 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_7 = torch.ops.vllm.unified_attention_with_output(query_23, key_23, value_7, output_46, 'model.layers.7.self_attn.attn'); query_23 = key_23 = value_7 = unified_attention_with_output_7 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_7: "bf16[s72, 2048][2048, 1]cuda:0" = output_46.view(-1, 2048); output_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_30: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_7, l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_7 = l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_47: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_30, group_name = 'tp:0'); output_parallel_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_15: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_81: "f32[s72, 4096][4096, 1]cuda:0" = output_47.to(torch.float32); output_47 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_82: "f32[s72, 4096][4096, 1]cuda:0" = x_81 + residual_13; x_81 = residual_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_14: "bf16[s72, 4096][4096, 1]cuda:0" = x_82.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_16: "f32[s72, 4096][4096, 1]cuda:0" = x_82.pow(2) + variance_15: "f32[s72, 1][1, 1]cuda:0" = pow_16.mean(dim = -1, keepdim = True); pow_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_47: "f32[s72, 1][1, 1]cuda:0" = variance_15 + 1e-05; variance_15 = None + rsqrt_15: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_47); add_47 = None + x_83: "f32[s72, 4096][4096, 1]cuda:0" = x_82 * rsqrt_15; x_82 = rsqrt_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_84: "bf16[s72, 4096][4096, 1]cuda:0" = x_83.to(torch.bfloat16); x_83 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_85: "bf16[s72, 4096][4096, 
1]cuda:0" = x_84 * _get_data_attr_15; x_84 = _get_data_attr_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_31: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_85, l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_85 = l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_222: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_31[(Ellipsis, slice(None, 7168, None))] + silu_7: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_222); getitem_222 = None + getitem_223: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_31[(Ellipsis, slice(7168, None, None))]; output_parallel_31 = None + x_86: "bf16[s72, 7168][7168, 1]cuda:0" = silu_7 * getitem_223; silu_7 = getitem_223 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_32: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_86, l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, None); x_86 = l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_48: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_32, group_name = 'tp:0'); output_parallel_32 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_16: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_87: "f32[s72, 4096][4096, 1]cuda:0" = output_48.to(torch.float32); output_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_88: "f32[s72, 4096][4096, 1]cuda:0" = x_87 + residual_14; x_87 = residual_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_15: "bf16[s72, 4096][4096, 1]cuda:0" = x_88.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_17: "f32[s72, 4096][4096, 1]cuda:0" = x_88.pow(2) + variance_16: "f32[s72, 1][1, 1]cuda:0" = pow_17.mean(dim = -1, keepdim = True); pow_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_49: "f32[s72, 1][1, 1]cuda:0" = variance_16 + 1e-05; variance_16 = None + rsqrt_16: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_49); add_49 = None + x_89: "f32[s72, 4096][4096, 1]cuda:0" = x_88 * rsqrt_16; x_88 = 
rsqrt_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_90: "bf16[s72, 4096][4096, 1]cuda:0" = x_89.to(torch.bfloat16); x_89 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_91: "bf16[s72, 4096][4096, 1]cuda:0" = x_90 * _get_data_attr_16; x_90 = _get_data_attr_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_33: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_91, l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_91 = l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_8 = output_parallel_33.split([2048, 512, 512], dim = -1); output_parallel_33 = None + q_8: "bf16[s72, 2048][3072, 1]cuda:0" = split_8[0] + k_8: "bf16[s72, 512][3072, 1]cuda:0" = split_8[1] + v_8: "bf16[s72, 512][3072, 1]cuda:0" = split_8[2]; split_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_8: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_8: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_8); positions_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_24 = cos_sin_8.chunk(2, dim = -1); cos_sin_8 = None + cos_24: "bf16[s72, 64][128, 1]cuda:0" = chunk_24[0] + sin_24: "bf16[s72, 64][128, 1]cuda:0" = chunk_24[1]; chunk_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_58 = q_8.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_24: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_8.view(s72, -1, 128); q_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_8: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_24[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_8: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_24[(Ellipsis, slice(128, None, None))]; query_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_33: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_24.unsqueeze(-2) + cos_25: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_33.to(torch.bfloat16); unsqueeze_33 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_34: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_24.unsqueeze(-2) + sin_25: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_34.to(torch.bfloat16); unsqueeze_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_25 = torch.chunk(query_rot_8, 2, dim = -1); query_rot_8 = None + x1_16: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_25[0] + x2_16: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_25[1]; chunk_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_109: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_16 * cos_25 + mul_110: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_16 * sin_25 + o1_16: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_109 - mul_110; mul_109 = mul_110 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_111: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_16 * cos_25; x2_16 = cos_25 = None + mul_112: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_16 * sin_25; x1_16 = sin_25 = None + o2_16: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_111 + mul_112; mul_111 = mul_112 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_49: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_16, o2_16), dim = -1); o1_16 = o2_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_33: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_49, query_pass_8), dim = -1); output_49 = query_pass_8 = None + query_25: "bf16[s72, 2048][2048, 1]cuda:0" = cat_33.reshape(size_58); cat_33 = size_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_59 = k_8.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_24: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_8.view(s72, -1, 128); k_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_8: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_24[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_8: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_24[(Ellipsis, slice(128, None, None))]; key_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_35: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_24.unsqueeze(-2); cos_24 = None + cos_26: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_35.to(torch.bfloat16); unsqueeze_35 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in 
forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_36: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_24.unsqueeze(-2); sin_24 = None + sin_26: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_36.to(torch.bfloat16); unsqueeze_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_26 = torch.chunk(key_rot_8, 2, dim = -1); key_rot_8 = None + x1_17: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_26[0] + x2_17: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_26[1]; chunk_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_113: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_17 * cos_26 + mul_114: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_17 * sin_26 + o1_17: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_113 - mul_114; mul_113 = mul_114 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_115: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_17 * cos_26; x2_17 = cos_26 = None + mul_116: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_17 * sin_26; x1_17 = sin_26 = None + o2_17: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_115 + mul_116; mul_115 = mul_116 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_50: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_17, o2_17), dim = -1); o1_17 = o2_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_35: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_50, key_pass_8), dim = -1); output_50 = key_pass_8 = None + key_25: "bf16[s72, 512][512, 1]cuda:0" = cat_35.reshape(size_59); cat_35 = size_59 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_60 = query_25.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_51: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_60, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_60 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_26: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_25.view(-1, 16, 128); query_25 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_52: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_51.view(-1, 16, 128); output_51 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_26: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_25.view(-1, 4, 128); key_25 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_8: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_8.view(-1, 4, 128); v_8 = None + + # File: 
/data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_8 = torch.ops.vllm.unified_attention_with_output(query_26, key_26, value_8, output_52, 'model.layers.8.self_attn.attn'); query_26 = key_26 = value_8 = unified_attention_with_output_8 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_8: "bf16[s72, 2048][2048, 1]cuda:0" = output_52.view(-1, 2048); output_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_34: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_8, l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_8 = l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_53: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_34, group_name = 'tp:0'); output_parallel_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_17: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_92: "f32[s72, 4096][4096, 1]cuda:0" = output_53.to(torch.float32); output_53 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_93: "f32[s72, 4096][4096, 1]cuda:0" = x_92 + residual_15; x_92 = residual_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_16: "bf16[s72, 4096][4096, 1]cuda:0" = x_93.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_18: "f32[s72, 4096][4096, 1]cuda:0" = x_93.pow(2) + variance_17: "f32[s72, 1][1, 1]cuda:0" = pow_18.mean(dim = -1, keepdim = True); pow_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_53: "f32[s72, 1][1, 1]cuda:0" = variance_17 + 1e-05; variance_17 = None + rsqrt_17: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_53); add_53 = None + x_94: "f32[s72, 4096][4096, 1]cuda:0" = x_93 * rsqrt_17; x_93 = rsqrt_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_95: "bf16[s72, 4096][4096, 1]cuda:0" = x_94.to(torch.bfloat16); x_94 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_96: "bf16[s72, 4096][4096, 1]cuda:0" = x_95 * _get_data_attr_17; x_95 = _get_data_attr_17 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_35: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_96, l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_96 = l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_250: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_35[(Ellipsis, slice(None, 7168, None))] + silu_8: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_250); getitem_250 = None + getitem_251: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_35[(Ellipsis, slice(7168, None, None))]; output_parallel_35 = None + x_97: "bf16[s72, 7168][7168, 1]cuda:0" = silu_8 * getitem_251; silu_8 = getitem_251 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_36: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_97, l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, None); x_97 = l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_54: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_36, group_name = 'tp:0'); output_parallel_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_18: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_98: "f32[s72, 4096][4096, 1]cuda:0" = output_54.to(torch.float32); output_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_99: "f32[s72, 4096][4096, 1]cuda:0" = x_98 + residual_16; x_98 = residual_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_17: "bf16[s72, 4096][4096, 1]cuda:0" = x_99.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_19: "f32[s72, 4096][4096, 1]cuda:0" = x_99.pow(2) + variance_18: "f32[s72, 1][1, 1]cuda:0" = pow_19.mean(dim = -1, keepdim = True); pow_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_55: "f32[s72, 1][1, 1]cuda:0" = variance_18 + 1e-05; variance_18 = None + rsqrt_18: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_55); add_55 = None + x_100: "f32[s72, 4096][4096, 1]cuda:0" = x_99 * rsqrt_18; x_99 = rsqrt_18 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_101: "bf16[s72, 4096][4096, 1]cuda:0" = x_100.to(torch.bfloat16); x_100 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_102: "bf16[s72, 4096][4096, 1]cuda:0" = x_101 * _get_data_attr_18; x_101 = _get_data_attr_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_37: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_102, l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_102 = l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_9 = output_parallel_37.split([2048, 512, 512], dim = -1); output_parallel_37 = None + q_9: "bf16[s72, 2048][3072, 1]cuda:0" = split_9[0] + k_9: "bf16[s72, 512][3072, 1]cuda:0" = split_9[1] + v_9: "bf16[s72, 512][3072, 1]cuda:0" = split_9[2]; split_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_9: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_9: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_9); positions_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_27 = cos_sin_9.chunk(2, dim = -1); cos_sin_9 = None + cos_27: "bf16[s72, 64][128, 1]cuda:0" = chunk_27[0] + sin_27: "bf16[s72, 64][128, 1]cuda:0" = chunk_27[1]; chunk_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_65 = q_9.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_27: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_9.view(s72, -1, 128); q_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_9: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_27[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_9: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_27[(Ellipsis, slice(128, None, None))]; query_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_37: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_27.unsqueeze(-2) + cos_28: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_37.to(torch.bfloat16); unsqueeze_37 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_38: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_27.unsqueeze(-2) + sin_28: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_38.to(torch.bfloat16); unsqueeze_38 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_28 = torch.chunk(query_rot_9, 2, dim = -1); query_rot_9 = None + x1_18: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_28[0] + x2_18: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_28[1]; chunk_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_122: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_18 * cos_28 + mul_123: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_18 * sin_28 + o1_18: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_122 - mul_123; mul_122 = mul_123 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_124: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_18 * cos_28; x2_18 = cos_28 = None + mul_125: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_18 * sin_28; x1_18 = sin_28 = None + o2_18: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_124 + mul_125; mul_124 = mul_125 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_55: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_18, o2_18), dim = -1); o1_18 = o2_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_37: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_55, query_pass_9), dim = -1); output_55 = query_pass_9 = None + query_28: "bf16[s72, 2048][2048, 1]cuda:0" = cat_37.reshape(size_65); cat_37 = size_65 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_66 = k_9.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_27: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_9.view(s72, -1, 128); k_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_9: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_27[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_9: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_27[(Ellipsis, slice(128, None, None))]; key_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_39: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_27.unsqueeze(-2); cos_27 = None + cos_29: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_39.to(torch.bfloat16); unsqueeze_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in 
forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_40: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_27.unsqueeze(-2); sin_27 = None + sin_29: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_40.to(torch.bfloat16); unsqueeze_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_29 = torch.chunk(key_rot_9, 2, dim = -1); key_rot_9 = None + x1_19: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_29[0] + x2_19: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_29[1]; chunk_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_126: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_19 * cos_29 + mul_127: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_19 * sin_29 + o1_19: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_126 - mul_127; mul_126 = mul_127 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_128: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_19 * cos_29; x2_19 = cos_29 = None + mul_129: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_19 * sin_29; x1_19 = sin_29 = None + o2_19: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_128 + mul_129; mul_128 = mul_129 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_56: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_19, o2_19), dim = -1); o1_19 = o2_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_39: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_56, key_pass_9), dim = -1); output_56 = key_pass_9 = None + key_28: "bf16[s72, 512][512, 1]cuda:0" = cat_39.reshape(size_66); cat_39 = size_66 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_67 = query_28.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_57: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_67, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_67 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_29: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_28.view(-1, 16, 128); query_28 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_58: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_57.view(-1, 16, 128); output_57 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_29: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_28.view(-1, 4, 128); key_28 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_9: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_9.view(-1, 4, 128); v_9 = None + + # File: 
/data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_9 = torch.ops.vllm.unified_attention_with_output(query_29, key_29, value_9, output_58, 'model.layers.9.self_attn.attn'); query_29 = key_29 = value_9 = unified_attention_with_output_9 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_9: "bf16[s72, 2048][2048, 1]cuda:0" = output_58.view(-1, 2048); output_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_38: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_9, l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_9 = l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_59: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_38, group_name = 'tp:0'); output_parallel_38 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_19: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_103: "f32[s72, 4096][4096, 1]cuda:0" = output_59.to(torch.float32); output_59 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_104: "f32[s72, 4096][4096, 1]cuda:0" = x_103 + residual_17; x_103 = residual_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_18: "bf16[s72, 4096][4096, 1]cuda:0" = x_104.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_20: "f32[s72, 4096][4096, 1]cuda:0" = x_104.pow(2) + variance_19: "f32[s72, 1][1, 1]cuda:0" = pow_20.mean(dim = -1, keepdim = True); pow_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_59: "f32[s72, 1][1, 1]cuda:0" = variance_19 + 1e-05; variance_19 = None + rsqrt_19: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_59); add_59 = None + x_105: "f32[s72, 4096][4096, 1]cuda:0" = x_104 * rsqrt_19; x_104 = rsqrt_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_106: "bf16[s72, 4096][4096, 1]cuda:0" = x_105.to(torch.bfloat16); x_105 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_107: "bf16[s72, 4096][4096, 1]cuda:0" = x_106 * _get_data_attr_19; x_106 = _get_data_attr_19 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_39: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_107, l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_107 = l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_278: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_39[(Ellipsis, slice(None, 7168, None))] + silu_9: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_278); getitem_278 = None + getitem_279: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_39[(Ellipsis, slice(7168, None, None))]; output_parallel_39 = None + x_108: "bf16[s72, 7168][7168, 1]cuda:0" = silu_9 * getitem_279; silu_9 = getitem_279 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_40: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_108, l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, None); x_108 = l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_60: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_40, group_name = 'tp:0'); output_parallel_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_20: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_109: "f32[s72, 4096][4096, 1]cuda:0" = output_60.to(torch.float32); output_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_110: "f32[s72, 4096][4096, 1]cuda:0" = x_109 + residual_18; x_109 = residual_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_19: "bf16[s72, 4096][4096, 1]cuda:0" = x_110.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_21: "f32[s72, 4096][4096, 1]cuda:0" = x_110.pow(2) + variance_20: "f32[s72, 1][1, 1]cuda:0" = pow_21.mean(dim = -1, keepdim = True); pow_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_61: "f32[s72, 1][1, 1]cuda:0" = variance_20 + 1e-05; variance_20 = None + rsqrt_20: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_61); add_61 = None + x_111: "f32[s72, 4096][4096, 1]cuda:0" = x_110 * rsqrt_20; x_110 = rsqrt_20 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_112: "bf16[s72, 4096][4096, 1]cuda:0" = x_111.to(torch.bfloat16); x_111 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_113: "bf16[s72, 4096][4096, 1]cuda:0" = x_112 * _get_data_attr_20; x_112 = _get_data_attr_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_41: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_113, l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_113 = l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_10 = output_parallel_41.split([2048, 512, 512], dim = -1); output_parallel_41 = None + q_10: "bf16[s72, 2048][3072, 1]cuda:0" = split_10[0] + k_10: "bf16[s72, 512][3072, 1]cuda:0" = split_10[1] + v_10: "bf16[s72, 512][3072, 1]cuda:0" = split_10[2]; split_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_10: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_10: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_10); positions_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_30 = cos_sin_10.chunk(2, dim = -1); cos_sin_10 = None + cos_30: "bf16[s72, 64][128, 1]cuda:0" = chunk_30[0] + sin_30: "bf16[s72, 64][128, 1]cuda:0" = chunk_30[1]; chunk_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_72 = q_10.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_30: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_10.view(s72, -1, 128); q_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_10: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_30[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_10: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_30[(Ellipsis, slice(128, None, None))]; query_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_41: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_30.unsqueeze(-2) + cos_31: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_41.to(torch.bfloat16); unsqueeze_41 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_42: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_30.unsqueeze(-2) + sin_31: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_42.to(torch.bfloat16); unsqueeze_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_31 = torch.chunk(query_rot_10, 2, dim = -1); query_rot_10 = None + x1_20: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_31[0] + x2_20: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_31[1]; chunk_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_135: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_20 * cos_31 + mul_136: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_20 * sin_31 + o1_20: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_135 - mul_136; mul_135 = mul_136 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_137: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_20 * cos_31; x2_20 = cos_31 = None + mul_138: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_20 * sin_31; x1_20 = sin_31 = None + o2_20: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_137 + mul_138; mul_137 = mul_138 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_61: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_20, o2_20), dim = -1); o1_20 = o2_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_41: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_61, query_pass_10), dim = -1); output_61 = query_pass_10 = None + query_31: "bf16[s72, 2048][2048, 1]cuda:0" = cat_41.reshape(size_72); cat_41 = size_72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_73 = k_10.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_30: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_10.view(s72, -1, 128); k_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_10: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_30[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_10: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_30[(Ellipsis, slice(128, None, None))]; key_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_43: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_30.unsqueeze(-2); cos_30 = None + cos_32: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_43.to(torch.bfloat16); unsqueeze_43 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_44: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_30.unsqueeze(-2); sin_30 = None + sin_32: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_44.to(torch.bfloat16); unsqueeze_44 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_32 = torch.chunk(key_rot_10, 2, dim = -1); key_rot_10 = None + x1_21: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_32[0] + x2_21: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_32[1]; chunk_32 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_139: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_21 * cos_32 + mul_140: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_21 * sin_32 + o1_21: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_139 - mul_140; mul_139 = mul_140 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_141: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_21 * cos_32; x2_21 = cos_32 = None + mul_142: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_21 * sin_32; x1_21 = sin_32 = None + o2_21: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_141 + mul_142; mul_141 = mul_142 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_62: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_21, o2_21), dim = -1); o1_21 = o2_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_43: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_62, key_pass_10), dim = -1); output_62 = key_pass_10 = None + key_31: "bf16[s72, 512][512, 1]cuda:0" = cat_43.reshape(size_73); cat_43 = size_73 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_74 = query_31.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_63: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_74, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_74 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_32: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_31.view(-1, 16, 128); query_31 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_64: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_63.view(-1, 16, 128); output_63 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_32: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_31.view(-1, 4, 128); key_31 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_10: "bf16[s72, 4, 128][3072, 128, 
1]cuda:0" = v_10.view(-1, 4, 128); v_10 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_10 = torch.ops.vllm.unified_attention_with_output(query_32, key_32, value_10, output_64, 'model.layers.10.self_attn.attn'); query_32 = key_32 = value_10 = unified_attention_with_output_10 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_10: "bf16[s72, 2048][2048, 1]cuda:0" = output_64.view(-1, 2048); output_64 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_42: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_10, l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_10 = l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_65: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_42, group_name = 'tp:0'); output_parallel_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_21: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_114: "f32[s72, 4096][4096, 1]cuda:0" = output_65.to(torch.float32); output_65 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_115: "f32[s72, 4096][4096, 1]cuda:0" = x_114 + residual_19; x_114 = residual_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_20: "bf16[s72, 4096][4096, 1]cuda:0" = x_115.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_22: "f32[s72, 4096][4096, 1]cuda:0" = x_115.pow(2) + variance_21: "f32[s72, 1][1, 1]cuda:0" = pow_22.mean(dim = -1, keepdim = True); pow_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_65: "f32[s72, 1][1, 1]cuda:0" = variance_21 + 1e-05; variance_21 = None + rsqrt_21: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_65); add_65 = None + x_116: "f32[s72, 4096][4096, 1]cuda:0" = x_115 * rsqrt_21; x_115 = rsqrt_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_117: "bf16[s72, 4096][4096, 1]cuda:0" = x_116.to(torch.bfloat16); x_116 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_118: "bf16[s72, 4096][4096, 1]cuda:0" = x_117 * _get_data_attr_21; 
x_117 = _get_data_attr_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_43: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_118, l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_118 = l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_306: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_43[(Ellipsis, slice(None, 7168, None))] + silu_10: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_306); getitem_306 = None + getitem_307: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_43[(Ellipsis, slice(7168, None, None))]; output_parallel_43 = None + x_119: "bf16[s72, 7168][7168, 1]cuda:0" = silu_10 * getitem_307; silu_10 = getitem_307 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_44: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_119, l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_, None); x_119 = l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_66: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_44, group_name = 'tp:0'); output_parallel_44 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_22: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_120: "f32[s72, 4096][4096, 1]cuda:0" = output_66.to(torch.float32); output_66 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_121: "f32[s72, 4096][4096, 1]cuda:0" = x_120 + residual_20; x_120 = residual_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_21: "bf16[s72, 4096][4096, 1]cuda:0" = x_121.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_23: "f32[s72, 4096][4096, 1]cuda:0" = x_121.pow(2) + variance_22: "f32[s72, 1][1, 1]cuda:0" = pow_23.mean(dim = -1, keepdim = True); pow_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_67: "f32[s72, 1][1, 1]cuda:0" = variance_22 + 1e-05; variance_22 = None + rsqrt_22: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_67); add_67 = None + x_122: "f32[s72, 4096][4096, 1]cuda:0" = x_121 * rsqrt_22; x_121 = rsqrt_22 = None + 
+ # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_123: "bf16[s72, 4096][4096, 1]cuda:0" = x_122.to(torch.bfloat16); x_122 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_124: "bf16[s72, 4096][4096, 1]cuda:0" = x_123 * _get_data_attr_22; x_123 = _get_data_attr_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_45: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_124, l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_124 = l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_11 = output_parallel_45.split([2048, 512, 512], dim = -1); output_parallel_45 = None + q_11: "bf16[s72, 2048][3072, 1]cuda:0" = split_11[0] + k_11: "bf16[s72, 512][3072, 1]cuda:0" = split_11[1] + v_11: "bf16[s72, 512][3072, 1]cuda:0" = split_11[2]; split_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_11: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_11: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_11); positions_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_33 = cos_sin_11.chunk(2, dim = -1); cos_sin_11 = None + cos_33: "bf16[s72, 64][128, 1]cuda:0" = chunk_33[0] + sin_33: "bf16[s72, 64][128, 1]cuda:0" = chunk_33[1]; chunk_33 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_79 = q_11.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_33: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_11.view(s72, -1, 128); q_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_11: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_33[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_11: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_33[(Ellipsis, slice(128, None, None))]; query_33 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_45: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_33.unsqueeze(-2) + cos_34: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_45.to(torch.bfloat16); unsqueeze_45 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_46: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_33.unsqueeze(-2) + sin_34: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_46.to(torch.bfloat16); unsqueeze_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_34 = torch.chunk(query_rot_11, 2, dim = -1); query_rot_11 = None + x1_22: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_34[0] + x2_22: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_34[1]; chunk_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_148: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_22 * cos_34 + mul_149: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_22 * sin_34 + o1_22: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_148 - mul_149; mul_148 = mul_149 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_150: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_22 * cos_34; x2_22 = cos_34 = None + mul_151: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_22 * sin_34; x1_22 = sin_34 = None + o2_22: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_150 + mul_151; mul_150 = mul_151 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_67: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_22, o2_22), dim = -1); o1_22 = o2_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_45: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_67, query_pass_11), dim = -1); output_67 = query_pass_11 = None + query_34: "bf16[s72, 2048][2048, 1]cuda:0" = cat_45.reshape(size_79); cat_45 = size_79 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_80 = k_11.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_33: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_11.view(s72, -1, 128); k_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_11: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_33[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_11: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_33[(Ellipsis, slice(128, None, None))]; key_33 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_47: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_33.unsqueeze(-2); cos_33 = None + cos_35: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_47.to(torch.bfloat16); unsqueeze_47 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_48: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_33.unsqueeze(-2); sin_33 = None + sin_35: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_48.to(torch.bfloat16); unsqueeze_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_35 = torch.chunk(key_rot_11, 2, dim = -1); key_rot_11 = None + x1_23: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_35[0] + x2_23: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_35[1]; chunk_35 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_152: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_23 * cos_35 + mul_153: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_23 * sin_35 + o1_23: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_152 - mul_153; mul_152 = mul_153 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_154: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_23 * cos_35; x2_23 = cos_35 = None + mul_155: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_23 * sin_35; x1_23 = sin_35 = None + o2_23: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_154 + mul_155; mul_154 = mul_155 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_68: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_23, o2_23), dim = -1); o1_23 = o2_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_47: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_68, key_pass_11), dim = -1); output_68 = key_pass_11 = None + key_34: "bf16[s72, 512][512, 1]cuda:0" = cat_47.reshape(size_80); cat_47 = size_80 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_81 = query_34.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_69: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_81, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_81 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_35: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_34.view(-1, 16, 128); query_34 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_70: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_69.view(-1, 16, 128); output_69 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_35: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_34.view(-1, 4, 128); key_34 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_11: "bf16[s72, 4, 128][3072, 128, 
1]cuda:0" = v_11.view(-1, 4, 128); v_11 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_11 = torch.ops.vllm.unified_attention_with_output(query_35, key_35, value_11, output_70, 'model.layers.11.self_attn.attn'); query_35 = key_35 = value_11 = unified_attention_with_output_11 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_11: "bf16[s72, 2048][2048, 1]cuda:0" = output_70.view(-1, 2048); output_70 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_46: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_11, l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_11 = l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_71: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_46, group_name = 'tp:0'); output_parallel_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_23: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_125: "f32[s72, 4096][4096, 1]cuda:0" = output_71.to(torch.float32); output_71 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_126: "f32[s72, 4096][4096, 1]cuda:0" = x_125 + residual_21; x_125 = residual_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_22: "bf16[s72, 4096][4096, 1]cuda:0" = x_126.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_24: "f32[s72, 4096][4096, 1]cuda:0" = x_126.pow(2) + variance_23: "f32[s72, 1][1, 1]cuda:0" = pow_24.mean(dim = -1, keepdim = True); pow_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_71: "f32[s72, 1][1, 1]cuda:0" = variance_23 + 1e-05; variance_23 = None + rsqrt_23: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_71); add_71 = None + x_127: "f32[s72, 4096][4096, 1]cuda:0" = x_126 * rsqrt_23; x_126 = rsqrt_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_128: "bf16[s72, 4096][4096, 1]cuda:0" = x_127.to(torch.bfloat16); x_127 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_129: "bf16[s72, 4096][4096, 1]cuda:0" = x_128 * _get_data_attr_23; 
x_128 = _get_data_attr_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_47: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_129, l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_129 = l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_334: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_47[(Ellipsis, slice(None, 7168, None))] + silu_11: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_334); getitem_334 = None + getitem_335: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_47[(Ellipsis, slice(7168, None, None))]; output_parallel_47 = None + x_130: "bf16[s72, 7168][7168, 1]cuda:0" = silu_11 * getitem_335; silu_11 = getitem_335 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_48: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_130, l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_, None); x_130 = l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_72: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_48, group_name = 'tp:0'); output_parallel_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_24: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_131: "f32[s72, 4096][4096, 1]cuda:0" = output_72.to(torch.float32); output_72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_132: "f32[s72, 4096][4096, 1]cuda:0" = x_131 + residual_22; x_131 = residual_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_23: "bf16[s72, 4096][4096, 1]cuda:0" = x_132.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_25: "f32[s72, 4096][4096, 1]cuda:0" = x_132.pow(2) + variance_24: "f32[s72, 1][1, 1]cuda:0" = pow_25.mean(dim = -1, keepdim = True); pow_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_73: "f32[s72, 1][1, 1]cuda:0" = variance_24 + 1e-05; variance_24 = None + rsqrt_24: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_73); add_73 = None + x_133: "f32[s72, 4096][4096, 1]cuda:0" = x_132 * rsqrt_24; x_132 = rsqrt_24 = None + 
+ # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_134: "bf16[s72, 4096][4096, 1]cuda:0" = x_133.to(torch.bfloat16); x_133 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_135: "bf16[s72, 4096][4096, 1]cuda:0" = x_134 * _get_data_attr_24; x_134 = _get_data_attr_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_49: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_135, l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_135 = l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_12 = output_parallel_49.split([2048, 512, 512], dim = -1); output_parallel_49 = None + q_12: "bf16[s72, 2048][3072, 1]cuda:0" = split_12[0] + k_12: "bf16[s72, 512][3072, 1]cuda:0" = split_12[1] + v_12: "bf16[s72, 512][3072, 1]cuda:0" = split_12[2]; split_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_12: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_12: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_12); positions_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_36 = cos_sin_12.chunk(2, dim = -1); cos_sin_12 = None + cos_36: "bf16[s72, 64][128, 1]cuda:0" = chunk_36[0] + sin_36: "bf16[s72, 64][128, 1]cuda:0" = chunk_36[1]; chunk_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_86 = q_12.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_36: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_12.view(s72, -1, 128); q_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_12: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_36[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_12: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_36[(Ellipsis, slice(128, None, None))]; query_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_49: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_36.unsqueeze(-2) + cos_37: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_49.to(torch.bfloat16); unsqueeze_49 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_50: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_36.unsqueeze(-2) + sin_37: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_50.to(torch.bfloat16); unsqueeze_50 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_37 = torch.chunk(query_rot_12, 2, dim = -1); query_rot_12 = None + x1_24: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_37[0] + x2_24: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_37[1]; chunk_37 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_161: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_24 * cos_37 + mul_162: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_24 * sin_37 + o1_24: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_161 - mul_162; mul_161 = mul_162 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_163: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_24 * cos_37; x2_24 = cos_37 = None + mul_164: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_24 * sin_37; x1_24 = sin_37 = None + o2_24: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_163 + mul_164; mul_163 = mul_164 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_73: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_24, o2_24), dim = -1); o1_24 = o2_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_49: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_73, query_pass_12), dim = -1); output_73 = query_pass_12 = None + query_37: "bf16[s72, 2048][2048, 1]cuda:0" = cat_49.reshape(size_86); cat_49 = size_86 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_87 = k_12.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_36: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_12.view(s72, -1, 128); k_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_12: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_36[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_12: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_36[(Ellipsis, slice(128, None, None))]; key_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_51: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_36.unsqueeze(-2); cos_36 = None + cos_38: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_51.to(torch.bfloat16); unsqueeze_51 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_52: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_36.unsqueeze(-2); sin_36 = None + sin_38: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_52.to(torch.bfloat16); unsqueeze_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_38 = torch.chunk(key_rot_12, 2, dim = -1); key_rot_12 = None + x1_25: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_38[0] + x2_25: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_38[1]; chunk_38 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_165: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_25 * cos_38 + mul_166: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_25 * sin_38 + o1_25: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_165 - mul_166; mul_165 = mul_166 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_167: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_25 * cos_38; x2_25 = cos_38 = None + mul_168: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_25 * sin_38; x1_25 = sin_38 = None + o2_25: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_167 + mul_168; mul_167 = mul_168 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_74: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_25, o2_25), dim = -1); o1_25 = o2_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_51: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_74, key_pass_12), dim = -1); output_74 = key_pass_12 = None + key_37: "bf16[s72, 512][512, 1]cuda:0" = cat_51.reshape(size_87); cat_51 = size_87 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_88 = query_37.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_75: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_88, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_88 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_38: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_37.view(-1, 16, 128); query_37 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_76: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_75.view(-1, 16, 128); output_75 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_38: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_37.view(-1, 4, 128); key_37 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_12: "bf16[s72, 4, 128][3072, 128, 
1]cuda:0" = v_12.view(-1, 4, 128); v_12 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_12 = torch.ops.vllm.unified_attention_with_output(query_38, key_38, value_12, output_76, 'model.layers.12.self_attn.attn'); query_38 = key_38 = value_12 = unified_attention_with_output_12 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_12: "bf16[s72, 2048][2048, 1]cuda:0" = output_76.view(-1, 2048); output_76 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_50: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_12, l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_12 = l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_77: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_50, group_name = 'tp:0'); output_parallel_50 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_25: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_136: "f32[s72, 4096][4096, 1]cuda:0" = output_77.to(torch.float32); output_77 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_137: "f32[s72, 4096][4096, 1]cuda:0" = x_136 + residual_23; x_136 = residual_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_24: "bf16[s72, 4096][4096, 1]cuda:0" = x_137.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_26: "f32[s72, 4096][4096, 1]cuda:0" = x_137.pow(2) + variance_25: "f32[s72, 1][1, 1]cuda:0" = pow_26.mean(dim = -1, keepdim = True); pow_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_77: "f32[s72, 1][1, 1]cuda:0" = variance_25 + 1e-05; variance_25 = None + rsqrt_25: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_77); add_77 = None + x_138: "f32[s72, 4096][4096, 1]cuda:0" = x_137 * rsqrt_25; x_137 = rsqrt_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_139: "bf16[s72, 4096][4096, 1]cuda:0" = x_138.to(torch.bfloat16); x_138 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_140: "bf16[s72, 4096][4096, 1]cuda:0" = x_139 * _get_data_attr_25; 
x_139 = _get_data_attr_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_51: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_140, l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_140 = l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_362: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_51[(Ellipsis, slice(None, 7168, None))] + silu_12: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_362); getitem_362 = None + getitem_363: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_51[(Ellipsis, slice(7168, None, None))]; output_parallel_51 = None + x_141: "bf16[s72, 7168][7168, 1]cuda:0" = silu_12 * getitem_363; silu_12 = getitem_363 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_52: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_141, l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, None); x_141 = l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_78: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_52, group_name = 'tp:0'); output_parallel_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_26: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_142: "f32[s72, 4096][4096, 1]cuda:0" = output_78.to(torch.float32); output_78 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_143: "f32[s72, 4096][4096, 1]cuda:0" = x_142 + residual_24; x_142 = residual_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_25: "bf16[s72, 4096][4096, 1]cuda:0" = x_143.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_27: "f32[s72, 4096][4096, 1]cuda:0" = x_143.pow(2) + variance_26: "f32[s72, 1][1, 1]cuda:0" = pow_27.mean(dim = -1, keepdim = True); pow_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_79: "f32[s72, 1][1, 1]cuda:0" = variance_26 + 1e-05; variance_26 = None + rsqrt_26: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_79); add_79 = None + x_144: "f32[s72, 4096][4096, 1]cuda:0" = x_143 * rsqrt_26; x_143 = rsqrt_26 = None + 
+ # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_145: "bf16[s72, 4096][4096, 1]cuda:0" = x_144.to(torch.bfloat16); x_144 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_146: "bf16[s72, 4096][4096, 1]cuda:0" = x_145 * _get_data_attr_26; x_145 = _get_data_attr_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_53: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_146, l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_146 = l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_13 = output_parallel_53.split([2048, 512, 512], dim = -1); output_parallel_53 = None + q_13: "bf16[s72, 2048][3072, 1]cuda:0" = split_13[0] + k_13: "bf16[s72, 512][3072, 1]cuda:0" = split_13[1] + v_13: "bf16[s72, 512][3072, 1]cuda:0" = split_13[2]; split_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_13: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_13: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_13); positions_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_39 = cos_sin_13.chunk(2, dim = -1); cos_sin_13 = None + cos_39: "bf16[s72, 64][128, 1]cuda:0" = chunk_39[0] + sin_39: "bf16[s72, 64][128, 1]cuda:0" = chunk_39[1]; chunk_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_93 = q_13.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_39: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_13.view(s72, -1, 128); q_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_13: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_39[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_13: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_39[(Ellipsis, slice(128, None, None))]; query_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_53: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_39.unsqueeze(-2) + cos_40: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_53.to(torch.bfloat16); unsqueeze_53 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_54: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_39.unsqueeze(-2) + sin_40: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_54.to(torch.bfloat16); unsqueeze_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_40 = torch.chunk(query_rot_13, 2, dim = -1); query_rot_13 = None + x1_26: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_40[0] + x2_26: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_40[1]; chunk_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_174: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_26 * cos_40 + mul_175: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_26 * sin_40 + o1_26: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_174 - mul_175; mul_174 = mul_175 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_176: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_26 * cos_40; x2_26 = cos_40 = None + mul_177: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_26 * sin_40; x1_26 = sin_40 = None + o2_26: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_176 + mul_177; mul_176 = mul_177 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_79: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_26, o2_26), dim = -1); o1_26 = o2_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_53: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_79, query_pass_13), dim = -1); output_79 = query_pass_13 = None + query_40: "bf16[s72, 2048][2048, 1]cuda:0" = cat_53.reshape(size_93); cat_53 = size_93 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_94 = k_13.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_39: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_13.view(s72, -1, 128); k_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_13: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_39[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_13: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_39[(Ellipsis, slice(128, None, None))]; key_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_55: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_39.unsqueeze(-2); cos_39 = None + cos_41: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_55.to(torch.bfloat16); unsqueeze_55 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_56: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_39.unsqueeze(-2); sin_39 = None + sin_41: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_56.to(torch.bfloat16); unsqueeze_56 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_41 = torch.chunk(key_rot_13, 2, dim = -1); key_rot_13 = None + x1_27: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_41[0] + x2_27: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_41[1]; chunk_41 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_178: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_27 * cos_41 + mul_179: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_27 * sin_41 + o1_27: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_178 - mul_179; mul_178 = mul_179 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_180: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_27 * cos_41; x2_27 = cos_41 = None + mul_181: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_27 * sin_41; x1_27 = sin_41 = None + o2_27: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_180 + mul_181; mul_180 = mul_181 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_80: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_27, o2_27), dim = -1); o1_27 = o2_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_55: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_80, key_pass_13), dim = -1); output_80 = key_pass_13 = None + key_40: "bf16[s72, 512][512, 1]cuda:0" = cat_55.reshape(size_94); cat_55 = size_94 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_95 = query_40.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_81: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_95, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_95 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_41: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_40.view(-1, 16, 128); query_40 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_82: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_81.view(-1, 16, 128); output_81 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_41: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_40.view(-1, 4, 128); key_40 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_13: "bf16[s72, 4, 128][3072, 128, 
1]cuda:0" = v_13.view(-1, 4, 128); v_13 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_13 = torch.ops.vllm.unified_attention_with_output(query_41, key_41, value_13, output_82, 'model.layers.13.self_attn.attn'); query_41 = key_41 = value_13 = unified_attention_with_output_13 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_13: "bf16[s72, 2048][2048, 1]cuda:0" = output_82.view(-1, 2048); output_82 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_54: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_13, l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_13 = l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_83: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_54, group_name = 'tp:0'); output_parallel_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_27: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_147: "f32[s72, 4096][4096, 1]cuda:0" = output_83.to(torch.float32); output_83 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_148: "f32[s72, 4096][4096, 1]cuda:0" = x_147 + residual_25; x_147 = residual_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_26: "bf16[s72, 4096][4096, 1]cuda:0" = x_148.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_28: "f32[s72, 4096][4096, 1]cuda:0" = x_148.pow(2) + variance_27: "f32[s72, 1][1, 1]cuda:0" = pow_28.mean(dim = -1, keepdim = True); pow_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_83: "f32[s72, 1][1, 1]cuda:0" = variance_27 + 1e-05; variance_27 = None + rsqrt_27: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_83); add_83 = None + x_149: "f32[s72, 4096][4096, 1]cuda:0" = x_148 * rsqrt_27; x_148 = rsqrt_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_150: "bf16[s72, 4096][4096, 1]cuda:0" = x_149.to(torch.bfloat16); x_149 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_151: "bf16[s72, 4096][4096, 1]cuda:0" = x_150 * _get_data_attr_27; 
x_150 = _get_data_attr_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_55: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_151, l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_151 = l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_390: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_55[(Ellipsis, slice(None, 7168, None))] + silu_13: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_390); getitem_390 = None + getitem_391: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_55[(Ellipsis, slice(7168, None, None))]; output_parallel_55 = None + x_152: "bf16[s72, 7168][7168, 1]cuda:0" = silu_13 * getitem_391; silu_13 = getitem_391 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_56: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_152, l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_, None); x_152 = l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_84: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_56, group_name = 'tp:0'); output_parallel_56 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_28: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_153: "f32[s72, 4096][4096, 1]cuda:0" = output_84.to(torch.float32); output_84 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_154: "f32[s72, 4096][4096, 1]cuda:0" = x_153 + residual_26; x_153 = residual_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_27: "bf16[s72, 4096][4096, 1]cuda:0" = x_154.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_29: "f32[s72, 4096][4096, 1]cuda:0" = x_154.pow(2) + variance_28: "f32[s72, 1][1, 1]cuda:0" = pow_29.mean(dim = -1, keepdim = True); pow_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_85: "f32[s72, 1][1, 1]cuda:0" = variance_28 + 1e-05; variance_28 = None + rsqrt_28: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_85); add_85 = None + x_155: "f32[s72, 4096][4096, 1]cuda:0" = x_154 * rsqrt_28; x_154 = rsqrt_28 = None + 
+ # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_156: "bf16[s72, 4096][4096, 1]cuda:0" = x_155.to(torch.bfloat16); x_155 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_157: "bf16[s72, 4096][4096, 1]cuda:0" = x_156 * _get_data_attr_28; x_156 = _get_data_attr_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_57: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_157, l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_157 = l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_14 = output_parallel_57.split([2048, 512, 512], dim = -1); output_parallel_57 = None + q_14: "bf16[s72, 2048][3072, 1]cuda:0" = split_14[0] + k_14: "bf16[s72, 512][3072, 1]cuda:0" = split_14[1] + v_14: "bf16[s72, 512][3072, 1]cuda:0" = split_14[2]; split_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_14: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_14: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_14); positions_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_42 = cos_sin_14.chunk(2, dim = -1); cos_sin_14 = None + cos_42: "bf16[s72, 64][128, 1]cuda:0" = chunk_42[0] + sin_42: "bf16[s72, 64][128, 1]cuda:0" = chunk_42[1]; chunk_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_100 = q_14.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_42: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_14.view(s72, -1, 128); q_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_14: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_42[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_14: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_42[(Ellipsis, slice(128, None, None))]; query_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_57: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_42.unsqueeze(-2) + cos_43: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_57.to(torch.bfloat16); unsqueeze_57 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_58: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_42.unsqueeze(-2) + sin_43: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_58.to(torch.bfloat16); unsqueeze_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_43 = torch.chunk(query_rot_14, 2, dim = -1); query_rot_14 = None + x1_28: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_43[0] + x2_28: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_43[1]; chunk_43 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_187: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_28 * cos_43 + mul_188: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_28 * sin_43 + o1_28: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_187 - mul_188; mul_187 = mul_188 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_189: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_28 * cos_43; x2_28 = cos_43 = None + mul_190: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_28 * sin_43; x1_28 = sin_43 = None + o2_28: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_189 + mul_190; mul_189 = mul_190 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_85: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_28, o2_28), dim = -1); o1_28 = o2_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_57: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_85, query_pass_14), dim = -1); output_85 = query_pass_14 = None + query_43: "bf16[s72, 2048][2048, 1]cuda:0" = cat_57.reshape(size_100); cat_57 = size_100 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_101 = k_14.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_42: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_14.view(s72, -1, 128); k_14 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_14: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_42[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_14: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_42[(Ellipsis, slice(128, None, None))]; key_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_59: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_42.unsqueeze(-2); cos_42 = None + cos_44: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_59.to(torch.bfloat16); unsqueeze_59 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_60: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_42.unsqueeze(-2); sin_42 = None + sin_44: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_60.to(torch.bfloat16); unsqueeze_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_44 = torch.chunk(key_rot_14, 2, dim = -1); key_rot_14 = None + x1_29: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_44[0] + x2_29: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_44[1]; chunk_44 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_191: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_29 * cos_44 + mul_192: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_29 * sin_44 + o1_29: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_191 - mul_192; mul_191 = mul_192 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_193: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_29 * cos_44; x2_29 = cos_44 = None + mul_194: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_29 * sin_44; x1_29 = sin_44 = None + o2_29: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_193 + mul_194; mul_193 = mul_194 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_86: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_29, o2_29), dim = -1); o1_29 = o2_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_59: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_86, key_pass_14), dim = -1); output_86 = key_pass_14 = None + key_43: "bf16[s72, 512][512, 1]cuda:0" = cat_59.reshape(size_101); cat_59 = size_101 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_102 = query_43.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_87: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_102, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_102 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_44: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_43.view(-1, 16, 128); query_43 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_88: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_87.view(-1, 16, 128); output_87 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_44: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_43.view(-1, 4, 128); key_43 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_14: "bf16[s72, 4, 128][3072, 128, 
1]cuda:0" = v_14.view(-1, 4, 128); v_14 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_14 = torch.ops.vllm.unified_attention_with_output(query_44, key_44, value_14, output_88, 'model.layers.14.self_attn.attn'); query_44 = key_44 = value_14 = unified_attention_with_output_14 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_14: "bf16[s72, 2048][2048, 1]cuda:0" = output_88.view(-1, 2048); output_88 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_58: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_14, l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_14 = l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_89: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_58, group_name = 'tp:0'); output_parallel_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_29: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_158: "f32[s72, 4096][4096, 1]cuda:0" = output_89.to(torch.float32); output_89 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_159: "f32[s72, 4096][4096, 1]cuda:0" = x_158 + residual_27; x_158 = residual_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_28: "bf16[s72, 4096][4096, 1]cuda:0" = x_159.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_30: "f32[s72, 4096][4096, 1]cuda:0" = x_159.pow(2) + variance_29: "f32[s72, 1][1, 1]cuda:0" = pow_30.mean(dim = -1, keepdim = True); pow_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_89: "f32[s72, 1][1, 1]cuda:0" = variance_29 + 1e-05; variance_29 = None + rsqrt_29: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_89); add_89 = None + x_160: "f32[s72, 4096][4096, 1]cuda:0" = x_159 * rsqrt_29; x_159 = rsqrt_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_161: "bf16[s72, 4096][4096, 1]cuda:0" = x_160.to(torch.bfloat16); x_160 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_162: "bf16[s72, 4096][4096, 1]cuda:0" = x_161 * _get_data_attr_29; 
x_161 = _get_data_attr_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_59: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_162, l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_162 = l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_418: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_59[(Ellipsis, slice(None, 7168, None))] + silu_14: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_418); getitem_418 = None + getitem_419: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_59[(Ellipsis, slice(7168, None, None))]; output_parallel_59 = None + x_163: "bf16[s72, 7168][7168, 1]cuda:0" = silu_14 * getitem_419; silu_14 = getitem_419 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_60: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_163, l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_, None); x_163 = l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_90: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_60, group_name = 'tp:0'); output_parallel_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_30: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_164: "f32[s72, 4096][4096, 1]cuda:0" = output_90.to(torch.float32); output_90 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_165: "f32[s72, 4096][4096, 1]cuda:0" = x_164 + residual_28; x_164 = residual_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_29: "bf16[s72, 4096][4096, 1]cuda:0" = x_165.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_31: "f32[s72, 4096][4096, 1]cuda:0" = x_165.pow(2) + variance_30: "f32[s72, 1][1, 1]cuda:0" = pow_31.mean(dim = -1, keepdim = True); pow_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_91: "f32[s72, 1][1, 1]cuda:0" = variance_30 + 1e-05; variance_30 = None + rsqrt_30: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_91); add_91 = None + x_166: "f32[s72, 4096][4096, 1]cuda:0" = x_165 * rsqrt_30; x_165 = rsqrt_30 = None + 
+ # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_167: "bf16[s72, 4096][4096, 1]cuda:0" = x_166.to(torch.bfloat16); x_166 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_168: "bf16[s72, 4096][4096, 1]cuda:0" = x_167 * _get_data_attr_30; x_167 = _get_data_attr_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_61: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_168, l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_168 = l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_15 = output_parallel_61.split([2048, 512, 512], dim = -1); output_parallel_61 = None + q_15: "bf16[s72, 2048][3072, 1]cuda:0" = split_15[0] + k_15: "bf16[s72, 512][3072, 1]cuda:0" = split_15[1] + v_15: "bf16[s72, 512][3072, 1]cuda:0" = split_15[2]; split_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_15: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_15: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_15); positions_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_45 = cos_sin_15.chunk(2, dim = -1); cos_sin_15 = None + cos_45: "bf16[s72, 64][128, 1]cuda:0" = chunk_45[0] + sin_45: "bf16[s72, 64][128, 1]cuda:0" = chunk_45[1]; chunk_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_107 = q_15.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_45: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_15.view(s72, -1, 128); q_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_15: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_45[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_15: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_45[(Ellipsis, slice(128, None, None))]; query_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_61: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_45.unsqueeze(-2) + cos_46: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_61.to(torch.bfloat16); unsqueeze_61 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_62: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_45.unsqueeze(-2) + sin_46: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_62.to(torch.bfloat16); unsqueeze_62 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_46 = torch.chunk(query_rot_15, 2, dim = -1); query_rot_15 = None + x1_30: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_46[0] + x2_30: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_46[1]; chunk_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_200: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_30 * cos_46 + mul_201: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_30 * sin_46 + o1_30: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_200 - mul_201; mul_200 = mul_201 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_202: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_30 * cos_46; x2_30 = cos_46 = None + mul_203: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_30 * sin_46; x1_30 = sin_46 = None + o2_30: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_202 + mul_203; mul_202 = mul_203 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_91: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_30, o2_30), dim = -1); o1_30 = o2_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_61: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_91, query_pass_15), dim = -1); output_91 = query_pass_15 = None + query_46: "bf16[s72, 2048][2048, 1]cuda:0" = cat_61.reshape(size_107); cat_61 = size_107 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_108 = k_15.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_45: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_15.view(s72, -1, 128); k_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_15: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_45[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_15: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_45[(Ellipsis, slice(128, None, None))]; key_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_63: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_45.unsqueeze(-2); cos_45 = None + cos_47: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_63.to(torch.bfloat16); unsqueeze_63 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_64: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_45.unsqueeze(-2); sin_45 = None + sin_47: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_64.to(torch.bfloat16); unsqueeze_64 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_47 = torch.chunk(key_rot_15, 2, dim = -1); key_rot_15 = None + x1_31: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_47[0] + x2_31: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_47[1]; chunk_47 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_204: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_31 * cos_47 + mul_205: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_31 * sin_47 + o1_31: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_204 - mul_205; mul_204 = mul_205 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_206: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_31 * cos_47; x2_31 = cos_47 = None + mul_207: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_31 * sin_47; x1_31 = sin_47 = None + o2_31: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_206 + mul_207; mul_206 = mul_207 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_92: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_31, o2_31), dim = -1); o1_31 = o2_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_63: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_92, key_pass_15), dim = -1); output_92 = key_pass_15 = None + key_46: "bf16[s72, 512][512, 1]cuda:0" = cat_63.reshape(size_108); cat_63 = size_108 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_109 = query_46.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_93: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_109, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_109 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_47: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_46.view(-1, 16, 128); query_46 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_94: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_93.view(-1, 16, 128); output_93 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_47: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_46.view(-1, 4, 128); key_46 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_15: "bf16[s72, 4, 128][3072, 128, 
1]cuda:0" = v_15.view(-1, 4, 128); v_15 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_15 = torch.ops.vllm.unified_attention_with_output(query_47, key_47, value_15, output_94, 'model.layers.15.self_attn.attn'); query_47 = key_47 = value_15 = unified_attention_with_output_15 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_15: "bf16[s72, 2048][2048, 1]cuda:0" = output_94.view(-1, 2048); output_94 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_62: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_15, l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_15 = l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_95: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_62, group_name = 'tp:0'); output_parallel_62 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_31: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_169: "f32[s72, 4096][4096, 1]cuda:0" = output_95.to(torch.float32); output_95 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_170: "f32[s72, 4096][4096, 1]cuda:0" = x_169 + residual_29; x_169 = residual_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_30: "bf16[s72, 4096][4096, 1]cuda:0" = x_170.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_32: "f32[s72, 4096][4096, 1]cuda:0" = x_170.pow(2) + variance_31: "f32[s72, 1][1, 1]cuda:0" = pow_32.mean(dim = -1, keepdim = True); pow_32 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_95: "f32[s72, 1][1, 1]cuda:0" = variance_31 + 1e-05; variance_31 = None + rsqrt_31: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_95); add_95 = None + x_171: "f32[s72, 4096][4096, 1]cuda:0" = x_170 * rsqrt_31; x_170 = rsqrt_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_172: "bf16[s72, 4096][4096, 1]cuda:0" = x_171.to(torch.bfloat16); x_171 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_173: "bf16[s72, 4096][4096, 1]cuda:0" = x_172 * _get_data_attr_31; 
x_172 = _get_data_attr_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_63: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_173, l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_173 = l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_446: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_63[(Ellipsis, slice(None, 7168, None))] + silu_15: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_446); getitem_446 = None + getitem_447: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_63[(Ellipsis, slice(7168, None, None))]; output_parallel_63 = None + x_174: "bf16[s72, 7168][7168, 1]cuda:0" = silu_15 * getitem_447; silu_15 = getitem_447 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_64: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_174, l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_, None); x_174 = l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_96: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_64, group_name = 'tp:0'); output_parallel_64 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_32: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_175: "f32[s72, 4096][4096, 1]cuda:0" = output_96.to(torch.float32); output_96 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_176: "f32[s72, 4096][4096, 1]cuda:0" = x_175 + residual_30; x_175 = residual_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_31: "bf16[s72, 4096][4096, 1]cuda:0" = x_176.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_33: "f32[s72, 4096][4096, 1]cuda:0" = x_176.pow(2) + variance_32: "f32[s72, 1][1, 1]cuda:0" = pow_33.mean(dim = -1, keepdim = True); pow_33 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_97: "f32[s72, 1][1, 1]cuda:0" = variance_32 + 1e-05; variance_32 = None + rsqrt_32: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_97); add_97 = None + x_177: "f32[s72, 4096][4096, 1]cuda:0" = x_176 * rsqrt_32; x_176 = rsqrt_32 = None + 
+ # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_178: "bf16[s72, 4096][4096, 1]cuda:0" = x_177.to(torch.bfloat16); x_177 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_179: "bf16[s72, 4096][4096, 1]cuda:0" = x_178 * _get_data_attr_32; x_178 = _get_data_attr_32 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_65: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_179, l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_179 = l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_16 = output_parallel_65.split([2048, 512, 512], dim = -1); output_parallel_65 = None + q_16: "bf16[s72, 2048][3072, 1]cuda:0" = split_16[0] + k_16: "bf16[s72, 512][3072, 1]cuda:0" = split_16[1] + v_16: "bf16[s72, 512][3072, 1]cuda:0" = split_16[2]; split_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_16: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_16: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_16); positions_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_48 = cos_sin_16.chunk(2, dim = -1); cos_sin_16 = None + cos_48: "bf16[s72, 64][128, 1]cuda:0" = chunk_48[0] + sin_48: "bf16[s72, 64][128, 1]cuda:0" = chunk_48[1]; chunk_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_114 = q_16.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_48: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_16.view(s72, -1, 128); q_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_16: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_48[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_16: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_48[(Ellipsis, slice(128, None, None))]; query_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_65: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_48.unsqueeze(-2) + cos_49: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_65.to(torch.bfloat16); unsqueeze_65 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_66: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_48.unsqueeze(-2) + sin_49: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_66.to(torch.bfloat16); unsqueeze_66 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_49 = torch.chunk(query_rot_16, 2, dim = -1); query_rot_16 = None + x1_32: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_49[0] + x2_32: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_49[1]; chunk_49 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_213: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_32 * cos_49 + mul_214: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_32 * sin_49 + o1_32: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_213 - mul_214; mul_213 = mul_214 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_215: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_32 * cos_49; x2_32 = cos_49 = None + mul_216: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_32 * sin_49; x1_32 = sin_49 = None + o2_32: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_215 + mul_216; mul_215 = mul_216 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_97: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_32, o2_32), dim = -1); o1_32 = o2_32 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_65: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_97, query_pass_16), dim = -1); output_97 = query_pass_16 = None + query_49: "bf16[s72, 2048][2048, 1]cuda:0" = cat_65.reshape(size_114); cat_65 = size_114 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_115 = k_16.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_48: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_16.view(s72, -1, 128); k_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_16: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_48[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_16: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_48[(Ellipsis, slice(128, None, None))]; key_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_67: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_48.unsqueeze(-2); cos_48 = None + cos_50: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_67.to(torch.bfloat16); unsqueeze_67 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_68: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_48.unsqueeze(-2); sin_48 = None + sin_50: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_68.to(torch.bfloat16); unsqueeze_68 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_50 = torch.chunk(key_rot_16, 2, dim = -1); key_rot_16 = None + x1_33: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_50[0] + x2_33: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_50[1]; chunk_50 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_217: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_33 * cos_50 + mul_218: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_33 * sin_50 + o1_33: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_217 - mul_218; mul_217 = mul_218 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_219: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_33 * cos_50; x2_33 = cos_50 = None + mul_220: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_33 * sin_50; x1_33 = sin_50 = None + o2_33: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_219 + mul_220; mul_219 = mul_220 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_98: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_33, o2_33), dim = -1); o1_33 = o2_33 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_67: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_98, key_pass_16), dim = -1); output_98 = key_pass_16 = None + key_49: "bf16[s72, 512][512, 1]cuda:0" = cat_67.reshape(size_115); cat_67 = size_115 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_116 = query_49.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_99: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_116, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_116 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_50: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_49.view(-1, 16, 128); query_49 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_100: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_99.view(-1, 16, 128); output_99 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_50: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_49.view(-1, 4, 128); key_49 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_16: "bf16[s72, 4, 128][3072, 128, 
1]cuda:0" = v_16.view(-1, 4, 128); v_16 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_16 = torch.ops.vllm.unified_attention_with_output(query_50, key_50, value_16, output_100, 'model.layers.16.self_attn.attn'); query_50 = key_50 = value_16 = unified_attention_with_output_16 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_16: "bf16[s72, 2048][2048, 1]cuda:0" = output_100.view(-1, 2048); output_100 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_66: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_16, l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_16 = l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_101: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_66, group_name = 'tp:0'); output_parallel_66 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_33: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_180: "f32[s72, 4096][4096, 1]cuda:0" = output_101.to(torch.float32); output_101 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_181: "f32[s72, 4096][4096, 1]cuda:0" = x_180 + residual_31; x_180 = residual_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_32: "bf16[s72, 4096][4096, 1]cuda:0" = x_181.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_34: "f32[s72, 4096][4096, 1]cuda:0" = x_181.pow(2) + variance_33: "f32[s72, 1][1, 1]cuda:0" = pow_34.mean(dim = -1, keepdim = True); pow_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_101: "f32[s72, 1][1, 1]cuda:0" = variance_33 + 1e-05; variance_33 = None + rsqrt_33: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_101); add_101 = None + x_182: "f32[s72, 4096][4096, 1]cuda:0" = x_181 * rsqrt_33; x_181 = rsqrt_33 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_183: "bf16[s72, 4096][4096, 1]cuda:0" = x_182.to(torch.bfloat16); x_182 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_184: "bf16[s72, 4096][4096, 1]cuda:0" = x_183 * 
_get_data_attr_33; x_183 = _get_data_attr_33 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_67: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_184, l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_184 = l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_474: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_67[(Ellipsis, slice(None, 7168, None))] + silu_16: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_474); getitem_474 = None + getitem_475: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_67[(Ellipsis, slice(7168, None, None))]; output_parallel_67 = None + x_185: "bf16[s72, 7168][7168, 1]cuda:0" = silu_16 * getitem_475; silu_16 = getitem_475 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_68: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_185, l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_, None); x_185 = l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_102: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_68, group_name = 'tp:0'); output_parallel_68 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_34: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_186: "f32[s72, 4096][4096, 1]cuda:0" = output_102.to(torch.float32); output_102 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_187: "f32[s72, 4096][4096, 1]cuda:0" = x_186 + residual_32; x_186 = residual_32 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_33: "bf16[s72, 4096][4096, 1]cuda:0" = x_187.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_35: "f32[s72, 4096][4096, 1]cuda:0" = x_187.pow(2) + variance_34: "f32[s72, 1][1, 1]cuda:0" = pow_35.mean(dim = -1, keepdim = True); pow_35 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_103: "f32[s72, 1][1, 1]cuda:0" = variance_34 + 1e-05; variance_34 = None + rsqrt_34: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_103); add_103 = None + x_188: "f32[s72, 4096][4096, 1]cuda:0" = x_187 * rsqrt_34; 
x_187 = rsqrt_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_189: "bf16[s72, 4096][4096, 1]cuda:0" = x_188.to(torch.bfloat16); x_188 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_190: "bf16[s72, 4096][4096, 1]cuda:0" = x_189 * _get_data_attr_34; x_189 = _get_data_attr_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_69: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_190, l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_190 = l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_17 = output_parallel_69.split([2048, 512, 512], dim = -1); output_parallel_69 = None + q_17: "bf16[s72, 2048][3072, 1]cuda:0" = split_17[0] + k_17: "bf16[s72, 512][3072, 1]cuda:0" = split_17[1] + v_17: "bf16[s72, 512][3072, 1]cuda:0" = split_17[2]; split_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_17: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_17: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_17); positions_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_51 = cos_sin_17.chunk(2, dim = -1); cos_sin_17 = None + cos_51: "bf16[s72, 64][128, 1]cuda:0" = chunk_51[0] + sin_51: "bf16[s72, 64][128, 1]cuda:0" = chunk_51[1]; chunk_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_121 = q_17.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_51: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_17.view(s72, -1, 128); q_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_17: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_51[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_17: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_51[(Ellipsis, slice(128, None, None))]; query_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_69: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_51.unsqueeze(-2) + cos_52: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_69.to(torch.bfloat16); unsqueeze_69 = 
None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_70: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_51.unsqueeze(-2) + sin_52: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_70.to(torch.bfloat16); unsqueeze_70 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_52 = torch.chunk(query_rot_17, 2, dim = -1); query_rot_17 = None + x1_34: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_52[0] + x2_34: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_52[1]; chunk_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_226: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_34 * cos_52 + mul_227: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_34 * sin_52 + o1_34: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_226 - mul_227; mul_226 = mul_227 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_228: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_34 * cos_52; x2_34 = cos_52 = None + mul_229: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_34 * sin_52; x1_34 = sin_52 = None + o2_34: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_228 + mul_229; mul_228 = mul_229 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_103: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_34, o2_34), dim = -1); o1_34 = o2_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_69: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_103, query_pass_17), dim = -1); output_103 = query_pass_17 = None + query_52: "bf16[s72, 2048][2048, 1]cuda:0" = cat_69.reshape(size_121); cat_69 = size_121 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_122 = k_17.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_51: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_17.view(s72, -1, 128); k_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_17: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_51[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_17: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_51[(Ellipsis, slice(128, None, None))]; key_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_71: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_51.unsqueeze(-2); cos_51 = None + cos_53: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_71.to(torch.bfloat16); unsqueeze_71 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_72: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_51.unsqueeze(-2); sin_51 = None + sin_53: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_72.to(torch.bfloat16); unsqueeze_72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_53 = torch.chunk(key_rot_17, 2, dim = -1); key_rot_17 = None + x1_35: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_53[0] + x2_35: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_53[1]; chunk_53 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_230: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_35 * cos_53 + mul_231: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_35 * sin_53 + o1_35: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_230 - mul_231; mul_230 = mul_231 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_232: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_35 * cos_53; x2_35 = cos_53 = None + mul_233: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_35 * sin_53; x1_35 = sin_53 = None + o2_35: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_232 + mul_233; mul_232 = mul_233 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_104: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_35, o2_35), dim = -1); o1_35 = o2_35 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_71: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_104, key_pass_17), dim = -1); output_104 = key_pass_17 = None + key_52: "bf16[s72, 512][512, 1]cuda:0" = cat_71.reshape(size_122); cat_71 = size_122 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_123 = query_52.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_105: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_123, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_123 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_53: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_52.view(-1, 16, 128); query_52 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_106: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_105.view(-1, 16, 128); output_105 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_53: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_52.view(-1, 4, 128); key_52 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_17: "bf16[s72, 4, 128][3072, 
128, 1]cuda:0" = v_17.view(-1, 4, 128); v_17 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_17 = torch.ops.vllm.unified_attention_with_output(query_53, key_53, value_17, output_106, 'model.layers.17.self_attn.attn'); query_53 = key_53 = value_17 = unified_attention_with_output_17 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_17: "bf16[s72, 2048][2048, 1]cuda:0" = output_106.view(-1, 2048); output_106 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_70: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_17, l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_17 = l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_107: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_70, group_name = 'tp:0'); output_parallel_70 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_35: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_191: "f32[s72, 4096][4096, 1]cuda:0" = output_107.to(torch.float32); output_107 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_192: "f32[s72, 4096][4096, 1]cuda:0" = x_191 + residual_33; x_191 = residual_33 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_34: "bf16[s72, 4096][4096, 1]cuda:0" = x_192.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_36: "f32[s72, 4096][4096, 1]cuda:0" = x_192.pow(2) + variance_35: "f32[s72, 1][1, 1]cuda:0" = pow_36.mean(dim = -1, keepdim = True); pow_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_107: "f32[s72, 1][1, 1]cuda:0" = variance_35 + 1e-05; variance_35 = None + rsqrt_35: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_107); add_107 = None + x_193: "f32[s72, 4096][4096, 1]cuda:0" = x_192 * rsqrt_35; x_192 = rsqrt_35 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_194: "bf16[s72, 4096][4096, 1]cuda:0" = x_193.to(torch.bfloat16); x_193 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_195: "bf16[s72, 4096][4096, 1]cuda:0" = x_194 * 
_get_data_attr_35; x_194 = _get_data_attr_35 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_71: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_195, l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_195 = l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_502: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_71[(Ellipsis, slice(None, 7168, None))] + silu_17: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_502); getitem_502 = None + getitem_503: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_71[(Ellipsis, slice(7168, None, None))]; output_parallel_71 = None + x_196: "bf16[s72, 7168][7168, 1]cuda:0" = silu_17 * getitem_503; silu_17 = getitem_503 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_72: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_196, l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, None); x_196 = l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_108: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_72, group_name = 'tp:0'); output_parallel_72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_36: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_197: "f32[s72, 4096][4096, 1]cuda:0" = output_108.to(torch.float32); output_108 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_198: "f32[s72, 4096][4096, 1]cuda:0" = x_197 + residual_34; x_197 = residual_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_35: "bf16[s72, 4096][4096, 1]cuda:0" = x_198.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_37: "f32[s72, 4096][4096, 1]cuda:0" = x_198.pow(2) + variance_36: "f32[s72, 1][1, 1]cuda:0" = pow_37.mean(dim = -1, keepdim = True); pow_37 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_109: "f32[s72, 1][1, 1]cuda:0" = variance_36 + 1e-05; variance_36 = None + rsqrt_36: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_109); add_109 = None + x_199: "f32[s72, 4096][4096, 1]cuda:0" = x_198 * rsqrt_36; 
x_198 = rsqrt_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_200: "bf16[s72, 4096][4096, 1]cuda:0" = x_199.to(torch.bfloat16); x_199 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_201: "bf16[s72, 4096][4096, 1]cuda:0" = x_200 * _get_data_attr_36; x_200 = _get_data_attr_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_73: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_201, l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_201 = l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_18 = output_parallel_73.split([2048, 512, 512], dim = -1); output_parallel_73 = None + q_18: "bf16[s72, 2048][3072, 1]cuda:0" = split_18[0] + k_18: "bf16[s72, 512][3072, 1]cuda:0" = split_18[1] + v_18: "bf16[s72, 512][3072, 1]cuda:0" = split_18[2]; split_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_18: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_18: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_18); positions_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_54 = cos_sin_18.chunk(2, dim = -1); cos_sin_18 = None + cos_54: "bf16[s72, 64][128, 1]cuda:0" = chunk_54[0] + sin_54: "bf16[s72, 64][128, 1]cuda:0" = chunk_54[1]; chunk_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_128 = q_18.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_54: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_18.view(s72, -1, 128); q_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_18: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_54[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_18: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_54[(Ellipsis, slice(128, None, None))]; query_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_73: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_54.unsqueeze(-2) + cos_55: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_73.to(torch.bfloat16); unsqueeze_73 = 
None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_74: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_54.unsqueeze(-2) + sin_55: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_74.to(torch.bfloat16); unsqueeze_74 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_55 = torch.chunk(query_rot_18, 2, dim = -1); query_rot_18 = None + x1_36: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_55[0] + x2_36: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_55[1]; chunk_55 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_239: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_36 * cos_55 + mul_240: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_36 * sin_55 + o1_36: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_239 - mul_240; mul_239 = mul_240 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_241: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_36 * cos_55; x2_36 = cos_55 = None + mul_242: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_36 * sin_55; x1_36 = sin_55 = None + o2_36: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_241 + mul_242; mul_241 = mul_242 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_109: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_36, o2_36), dim = -1); o1_36 = o2_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_73: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_109, query_pass_18), dim = -1); output_109 = query_pass_18 = None + query_55: "bf16[s72, 2048][2048, 1]cuda:0" = cat_73.reshape(size_128); cat_73 = size_128 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_129 = k_18.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_54: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_18.view(s72, -1, 128); k_18 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_18: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_54[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_18: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_54[(Ellipsis, slice(128, None, None))]; key_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_75: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_54.unsqueeze(-2); cos_54 = None + cos_56: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_75.to(torch.bfloat16); unsqueeze_75 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_76: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_54.unsqueeze(-2); sin_54 = None + sin_56: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_76.to(torch.bfloat16); unsqueeze_76 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_56 = torch.chunk(key_rot_18, 2, dim = -1); key_rot_18 = None + x1_37: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_56[0] + x2_37: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_56[1]; chunk_56 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_243: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_37 * cos_56 + mul_244: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_37 * sin_56 + o1_37: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_243 - mul_244; mul_243 = mul_244 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_245: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_37 * cos_56; x2_37 = cos_56 = None + mul_246: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_37 * sin_56; x1_37 = sin_56 = None + o2_37: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_245 + mul_246; mul_245 = mul_246 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_110: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_37, o2_37), dim = -1); o1_37 = o2_37 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_75: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_110, key_pass_18), dim = -1); output_110 = key_pass_18 = None + key_55: "bf16[s72, 512][512, 1]cuda:0" = cat_75.reshape(size_129); cat_75 = size_129 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_130 = query_55.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_111: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_130, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_130 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_56: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_55.view(-1, 16, 128); query_55 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_112: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_111.view(-1, 16, 128); output_111 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_56: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_55.view(-1, 4, 128); key_55 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_18: "bf16[s72, 4, 128][3072, 
128, 1]cuda:0" = v_18.view(-1, 4, 128); v_18 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_18 = torch.ops.vllm.unified_attention_with_output(query_56, key_56, value_18, output_112, 'model.layers.18.self_attn.attn'); query_56 = key_56 = value_18 = unified_attention_with_output_18 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_18: "bf16[s72, 2048][2048, 1]cuda:0" = output_112.view(-1, 2048); output_112 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_74: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_18, l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_18 = l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_113: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_74, group_name = 'tp:0'); output_parallel_74 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_37: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_202: "f32[s72, 4096][4096, 1]cuda:0" = output_113.to(torch.float32); output_113 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_203: "f32[s72, 4096][4096, 1]cuda:0" = x_202 + residual_35; x_202 = residual_35 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_36: "bf16[s72, 4096][4096, 1]cuda:0" = x_203.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_38: "f32[s72, 4096][4096, 1]cuda:0" = x_203.pow(2) + variance_37: "f32[s72, 1][1, 1]cuda:0" = pow_38.mean(dim = -1, keepdim = True); pow_38 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_113: "f32[s72, 1][1, 1]cuda:0" = variance_37 + 1e-05; variance_37 = None + rsqrt_37: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_113); add_113 = None + x_204: "f32[s72, 4096][4096, 1]cuda:0" = x_203 * rsqrt_37; x_203 = rsqrt_37 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_205: "bf16[s72, 4096][4096, 1]cuda:0" = x_204.to(torch.bfloat16); x_204 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_206: "bf16[s72, 4096][4096, 1]cuda:0" = x_205 * 
_get_data_attr_37; x_205 = _get_data_attr_37 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_75: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_206, l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_206 = l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_530: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_75[(Ellipsis, slice(None, 7168, None))] + silu_18: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_530); getitem_530 = None + getitem_531: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_75[(Ellipsis, slice(7168, None, None))]; output_parallel_75 = None + x_207: "bf16[s72, 7168][7168, 1]cuda:0" = silu_18 * getitem_531; silu_18 = getitem_531 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_76: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_207, l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_, None); x_207 = l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_114: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_76, group_name = 'tp:0'); output_parallel_76 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_38: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_208: "f32[s72, 4096][4096, 1]cuda:0" = output_114.to(torch.float32); output_114 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_209: "f32[s72, 4096][4096, 1]cuda:0" = x_208 + residual_36; x_208 = residual_36 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_37: "bf16[s72, 4096][4096, 1]cuda:0" = x_209.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_39: "f32[s72, 4096][4096, 1]cuda:0" = x_209.pow(2) + variance_38: "f32[s72, 1][1, 1]cuda:0" = pow_39.mean(dim = -1, keepdim = True); pow_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_115: "f32[s72, 1][1, 1]cuda:0" = variance_38 + 1e-05; variance_38 = None + rsqrt_38: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_115); add_115 = None + x_210: "f32[s72, 4096][4096, 1]cuda:0" = x_209 * rsqrt_38; 
x_209 = rsqrt_38 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_211: "bf16[s72, 4096][4096, 1]cuda:0" = x_210.to(torch.bfloat16); x_210 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_212: "bf16[s72, 4096][4096, 1]cuda:0" = x_211 * _get_data_attr_38; x_211 = _get_data_attr_38 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_77: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_212, l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_212 = l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_19 = output_parallel_77.split([2048, 512, 512], dim = -1); output_parallel_77 = None + q_19: "bf16[s72, 2048][3072, 1]cuda:0" = split_19[0] + k_19: "bf16[s72, 512][3072, 1]cuda:0" = split_19[1] + v_19: "bf16[s72, 512][3072, 1]cuda:0" = split_19[2]; split_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_19: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_19: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_19); positions_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_57 = cos_sin_19.chunk(2, dim = -1); cos_sin_19 = None + cos_57: "bf16[s72, 64][128, 1]cuda:0" = chunk_57[0] + sin_57: "bf16[s72, 64][128, 1]cuda:0" = chunk_57[1]; chunk_57 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_135 = q_19.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_57: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_19.view(s72, -1, 128); q_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_19: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_57[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_19: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_57[(Ellipsis, slice(128, None, None))]; query_57 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_77: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_57.unsqueeze(-2) + cos_58: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_77.to(torch.bfloat16); unsqueeze_77 = 
None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_78: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_57.unsqueeze(-2) + sin_58: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_78.to(torch.bfloat16); unsqueeze_78 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_58 = torch.chunk(query_rot_19, 2, dim = -1); query_rot_19 = None + x1_38: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_58[0] + x2_38: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_58[1]; chunk_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_252: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_38 * cos_58 + mul_253: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_38 * sin_58 + o1_38: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_252 - mul_253; mul_252 = mul_253 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_254: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_38 * cos_58; x2_38 = cos_58 = None + mul_255: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_38 * sin_58; x1_38 = sin_58 = None + o2_38: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_254 + mul_255; mul_254 = mul_255 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_115: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_38, o2_38), dim = -1); o1_38 = o2_38 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_77: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_115, query_pass_19), dim = -1); output_115 = query_pass_19 = None + query_58: "bf16[s72, 2048][2048, 1]cuda:0" = cat_77.reshape(size_135); cat_77 = size_135 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_136 = k_19.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_57: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_19.view(s72, -1, 128); k_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_19: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_57[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_19: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_57[(Ellipsis, slice(128, None, None))]; key_57 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_79: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_57.unsqueeze(-2); cos_57 = None + cos_59: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_79.to(torch.bfloat16); unsqueeze_79 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_80: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_57.unsqueeze(-2); sin_57 = None + sin_59: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_80.to(torch.bfloat16); unsqueeze_80 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_59 = torch.chunk(key_rot_19, 2, dim = -1); key_rot_19 = None + x1_39: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_59[0] + x2_39: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_59[1]; chunk_59 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_256: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_39 * cos_59 + mul_257: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_39 * sin_59 + o1_39: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_256 - mul_257; mul_256 = mul_257 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_258: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_39 * cos_59; x2_39 = cos_59 = None + mul_259: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_39 * sin_59; x1_39 = sin_59 = None + o2_39: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_258 + mul_259; mul_258 = mul_259 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_116: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_39, o2_39), dim = -1); o1_39 = o2_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_79: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_116, key_pass_19), dim = -1); output_116 = key_pass_19 = None + key_58: "bf16[s72, 512][512, 1]cuda:0" = cat_79.reshape(size_136); cat_79 = size_136 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_137 = query_58.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_117: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_137, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_137 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_59: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_58.view(-1, 16, 128); query_58 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_118: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_117.view(-1, 16, 128); output_117 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_59: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_58.view(-1, 4, 128); key_58 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_19: "bf16[s72, 4, 128][3072, 
128, 1]cuda:0" = v_19.view(-1, 4, 128); v_19 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_19 = torch.ops.vllm.unified_attention_with_output(query_59, key_59, value_19, output_118, 'model.layers.19.self_attn.attn'); query_59 = key_59 = value_19 = unified_attention_with_output_19 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_19: "bf16[s72, 2048][2048, 1]cuda:0" = output_118.view(-1, 2048); output_118 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_78: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_19, l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_19 = l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_119: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_78, group_name = 'tp:0'); output_parallel_78 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_39: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_213: "f32[s72, 4096][4096, 1]cuda:0" = output_119.to(torch.float32); output_119 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_214: "f32[s72, 4096][4096, 1]cuda:0" = x_213 + residual_37; x_213 = residual_37 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_38: "bf16[s72, 4096][4096, 1]cuda:0" = x_214.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_40: "f32[s72, 4096][4096, 1]cuda:0" = x_214.pow(2) + variance_39: "f32[s72, 1][1, 1]cuda:0" = pow_40.mean(dim = -1, keepdim = True); pow_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_119: "f32[s72, 1][1, 1]cuda:0" = variance_39 + 1e-05; variance_39 = None + rsqrt_39: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_119); add_119 = None + x_215: "f32[s72, 4096][4096, 1]cuda:0" = x_214 * rsqrt_39; x_214 = rsqrt_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_216: "bf16[s72, 4096][4096, 1]cuda:0" = x_215.to(torch.bfloat16); x_215 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_217: "bf16[s72, 4096][4096, 1]cuda:0" = x_216 * 
_get_data_attr_39; x_216 = _get_data_attr_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_79: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_217, l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_217 = l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_558: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_79[(Ellipsis, slice(None, 7168, None))] + silu_19: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_558); getitem_558 = None + getitem_559: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_79[(Ellipsis, slice(7168, None, None))]; output_parallel_79 = None + x_218: "bf16[s72, 7168][7168, 1]cuda:0" = silu_19 * getitem_559; silu_19 = getitem_559 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_80: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_218, l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_, None); x_218 = l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_120: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_80, group_name = 'tp:0'); output_parallel_80 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_40: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_219: "f32[s72, 4096][4096, 1]cuda:0" = output_120.to(torch.float32); output_120 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_220: "f32[s72, 4096][4096, 1]cuda:0" = x_219 + residual_38; x_219 = residual_38 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_39: "bf16[s72, 4096][4096, 1]cuda:0" = x_220.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_41: "f32[s72, 4096][4096, 1]cuda:0" = x_220.pow(2) + variance_40: "f32[s72, 1][1, 1]cuda:0" = pow_41.mean(dim = -1, keepdim = True); pow_41 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_121: "f32[s72, 1][1, 1]cuda:0" = variance_40 + 1e-05; variance_40 = None + rsqrt_40: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_121); add_121 = None + x_221: "f32[s72, 4096][4096, 1]cuda:0" = x_220 * rsqrt_40; 
x_220 = rsqrt_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_222: "bf16[s72, 4096][4096, 1]cuda:0" = x_221.to(torch.bfloat16); x_221 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_223: "bf16[s72, 4096][4096, 1]cuda:0" = x_222 * _get_data_attr_40; x_222 = _get_data_attr_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_81: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_223, l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_223 = l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_20 = output_parallel_81.split([2048, 512, 512], dim = -1); output_parallel_81 = None + q_20: "bf16[s72, 2048][3072, 1]cuda:0" = split_20[0] + k_20: "bf16[s72, 512][3072, 1]cuda:0" = split_20[1] + v_20: "bf16[s72, 512][3072, 1]cuda:0" = split_20[2]; split_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_20: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_20: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_20); positions_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_60 = cos_sin_20.chunk(2, dim = -1); cos_sin_20 = None + cos_60: "bf16[s72, 64][128, 1]cuda:0" = chunk_60[0] + sin_60: "bf16[s72, 64][128, 1]cuda:0" = chunk_60[1]; chunk_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_142 = q_20.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_60: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_20.view(s72, -1, 128); q_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_20: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_60[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_20: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_60[(Ellipsis, slice(128, None, None))]; query_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_81: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_60.unsqueeze(-2) + cos_61: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_81.to(torch.bfloat16); unsqueeze_81 = 
None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_82: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_60.unsqueeze(-2) + sin_61: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_82.to(torch.bfloat16); unsqueeze_82 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_61 = torch.chunk(query_rot_20, 2, dim = -1); query_rot_20 = None + x1_40: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_61[0] + x2_40: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_61[1]; chunk_61 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_265: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_40 * cos_61 + mul_266: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_40 * sin_61 + o1_40: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_265 - mul_266; mul_265 = mul_266 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_267: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_40 * cos_61; x2_40 = cos_61 = None + mul_268: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_40 * sin_61; x1_40 = sin_61 = None + o2_40: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_267 + mul_268; mul_267 = mul_268 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_121: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_40, o2_40), dim = -1); o1_40 = o2_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_81: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_121, query_pass_20), dim = -1); output_121 = query_pass_20 = None + query_61: "bf16[s72, 2048][2048, 1]cuda:0" = cat_81.reshape(size_142); cat_81 = size_142 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_143 = k_20.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_60: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_20.view(s72, -1, 128); k_20 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_20: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_60[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_20: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_60[(Ellipsis, slice(128, None, None))]; key_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_83: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_60.unsqueeze(-2); cos_60 = None + cos_62: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_83.to(torch.bfloat16); unsqueeze_83 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_84: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_60.unsqueeze(-2); sin_60 = None + sin_62: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_84.to(torch.bfloat16); unsqueeze_84 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_62 = torch.chunk(key_rot_20, 2, dim = -1); key_rot_20 = None + x1_41: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_62[0] + x2_41: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_62[1]; chunk_62 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_269: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_41 * cos_62 + mul_270: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_41 * sin_62 + o1_41: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_269 - mul_270; mul_269 = mul_270 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_271: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_41 * cos_62; x2_41 = cos_62 = None + mul_272: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_41 * sin_62; x1_41 = sin_62 = None + o2_41: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_271 + mul_272; mul_271 = mul_272 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_122: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_41, o2_41), dim = -1); o1_41 = o2_41 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_83: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_122, key_pass_20), dim = -1); output_122 = key_pass_20 = None + key_61: "bf16[s72, 512][512, 1]cuda:0" = cat_83.reshape(size_143); cat_83 = size_143 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_144 = query_61.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_123: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_144, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_144 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_62: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_61.view(-1, 16, 128); query_61 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_124: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_123.view(-1, 16, 128); output_123 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_62: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_61.view(-1, 4, 128); key_61 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_20: "bf16[s72, 4, 128][3072, 
128, 1]cuda:0" = v_20.view(-1, 4, 128); v_20 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_20 = torch.ops.vllm.unified_attention_with_output(query_62, key_62, value_20, output_124, 'model.layers.20.self_attn.attn'); query_62 = key_62 = value_20 = unified_attention_with_output_20 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_20: "bf16[s72, 2048][2048, 1]cuda:0" = output_124.view(-1, 2048); output_124 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_82: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_20, l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_20 = l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_125: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_82, group_name = 'tp:0'); output_parallel_82 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_41: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_224: "f32[s72, 4096][4096, 1]cuda:0" = output_125.to(torch.float32); output_125 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_225: "f32[s72, 4096][4096, 1]cuda:0" = x_224 + residual_39; x_224 = residual_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_40: "bf16[s72, 4096][4096, 1]cuda:0" = x_225.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_42: "f32[s72, 4096][4096, 1]cuda:0" = x_225.pow(2) + variance_41: "f32[s72, 1][1, 1]cuda:0" = pow_42.mean(dim = -1, keepdim = True); pow_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_125: "f32[s72, 1][1, 1]cuda:0" = variance_41 + 1e-05; variance_41 = None + rsqrt_41: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_125); add_125 = None + x_226: "f32[s72, 4096][4096, 1]cuda:0" = x_225 * rsqrt_41; x_225 = rsqrt_41 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_227: "bf16[s72, 4096][4096, 1]cuda:0" = x_226.to(torch.bfloat16); x_226 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_228: "bf16[s72, 4096][4096, 1]cuda:0" = x_227 * 
_get_data_attr_41; x_227 = _get_data_attr_41 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_83: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_228, l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_228 = l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_586: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_83[(Ellipsis, slice(None, 7168, None))] + silu_20: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_586); getitem_586 = None + getitem_587: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_83[(Ellipsis, slice(7168, None, None))]; output_parallel_83 = None + x_229: "bf16[s72, 7168][7168, 1]cuda:0" = silu_20 * getitem_587; silu_20 = getitem_587 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_84: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_229, l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_, None); x_229 = l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_126: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_84, group_name = 'tp:0'); output_parallel_84 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_42: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_230: "f32[s72, 4096][4096, 1]cuda:0" = output_126.to(torch.float32); output_126 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_231: "f32[s72, 4096][4096, 1]cuda:0" = x_230 + residual_40; x_230 = residual_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_41: "bf16[s72, 4096][4096, 1]cuda:0" = x_231.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_43: "f32[s72, 4096][4096, 1]cuda:0" = x_231.pow(2) + variance_42: "f32[s72, 1][1, 1]cuda:0" = pow_43.mean(dim = -1, keepdim = True); pow_43 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_127: "f32[s72, 1][1, 1]cuda:0" = variance_42 + 1e-05; variance_42 = None + rsqrt_42: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_127); add_127 = None + x_232: "f32[s72, 4096][4096, 1]cuda:0" = x_231 * rsqrt_42; 
x_231 = rsqrt_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_233: "bf16[s72, 4096][4096, 1]cuda:0" = x_232.to(torch.bfloat16); x_232 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_234: "bf16[s72, 4096][4096, 1]cuda:0" = x_233 * _get_data_attr_42; x_233 = _get_data_attr_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_85: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_234, l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_234 = l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_21 = output_parallel_85.split([2048, 512, 512], dim = -1); output_parallel_85 = None + q_21: "bf16[s72, 2048][3072, 1]cuda:0" = split_21[0] + k_21: "bf16[s72, 512][3072, 1]cuda:0" = split_21[1] + v_21: "bf16[s72, 512][3072, 1]cuda:0" = split_21[2]; split_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_21: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_21: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_21); positions_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_63 = cos_sin_21.chunk(2, dim = -1); cos_sin_21 = None + cos_63: "bf16[s72, 64][128, 1]cuda:0" = chunk_63[0] + sin_63: "bf16[s72, 64][128, 1]cuda:0" = chunk_63[1]; chunk_63 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_149 = q_21.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_63: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_21.view(s72, -1, 128); q_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_21: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_63[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_21: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_63[(Ellipsis, slice(128, None, None))]; query_63 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_85: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_63.unsqueeze(-2) + cos_64: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_85.to(torch.bfloat16); unsqueeze_85 = 
None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_86: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_63.unsqueeze(-2) + sin_64: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_86.to(torch.bfloat16); unsqueeze_86 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_64 = torch.chunk(query_rot_21, 2, dim = -1); query_rot_21 = None + x1_42: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_64[0] + x2_42: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_64[1]; chunk_64 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_278: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_42 * cos_64 + mul_279: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_42 * sin_64 + o1_42: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_278 - mul_279; mul_278 = mul_279 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_280: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_42 * cos_64; x2_42 = cos_64 = None + mul_281: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_42 * sin_64; x1_42 = sin_64 = None + o2_42: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_280 + mul_281; mul_280 = mul_281 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_127: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_42, o2_42), dim = -1); o1_42 = o2_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_85: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_127, query_pass_21), dim = -1); output_127 = query_pass_21 = None + query_64: "bf16[s72, 2048][2048, 1]cuda:0" = cat_85.reshape(size_149); cat_85 = size_149 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_150 = k_21.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_63: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_21.view(s72, -1, 128); k_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_21: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_63[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_21: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_63[(Ellipsis, slice(128, None, None))]; key_63 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_87: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_63.unsqueeze(-2); cos_63 = None + cos_65: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_87.to(torch.bfloat16); unsqueeze_87 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_88: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_63.unsqueeze(-2); sin_63 = None + sin_65: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_88.to(torch.bfloat16); unsqueeze_88 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_65 = torch.chunk(key_rot_21, 2, dim = -1); key_rot_21 = None + x1_43: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_65[0] + x2_43: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_65[1]; chunk_65 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_282: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_43 * cos_65 + mul_283: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_43 * sin_65 + o1_43: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_282 - mul_283; mul_282 = mul_283 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_284: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_43 * cos_65; x2_43 = cos_65 = None + mul_285: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_43 * sin_65; x1_43 = sin_65 = None + o2_43: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_284 + mul_285; mul_284 = mul_285 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_128: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_43, o2_43), dim = -1); o1_43 = o2_43 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_87: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_128, key_pass_21), dim = -1); output_128 = key_pass_21 = None + key_64: "bf16[s72, 512][512, 1]cuda:0" = cat_87.reshape(size_150); cat_87 = size_150 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_151 = query_64.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_129: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_151, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_151 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_65: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_64.view(-1, 16, 128); query_64 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_130: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_129.view(-1, 16, 128); output_129 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_65: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_64.view(-1, 4, 128); key_64 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_21: "bf16[s72, 4, 128][3072, 
128, 1]cuda:0" = v_21.view(-1, 4, 128); v_21 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_21 = torch.ops.vllm.unified_attention_with_output(query_65, key_65, value_21, output_130, 'model.layers.21.self_attn.attn'); query_65 = key_65 = value_21 = unified_attention_with_output_21 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_21: "bf16[s72, 2048][2048, 1]cuda:0" = output_130.view(-1, 2048); output_130 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_86: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_21, l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_21 = l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_131: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_86, group_name = 'tp:0'); output_parallel_86 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_43: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_235: "f32[s72, 4096][4096, 1]cuda:0" = output_131.to(torch.float32); output_131 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_236: "f32[s72, 4096][4096, 1]cuda:0" = x_235 + residual_41; x_235 = residual_41 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_42: "bf16[s72, 4096][4096, 1]cuda:0" = x_236.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_44: "f32[s72, 4096][4096, 1]cuda:0" = x_236.pow(2) + variance_43: "f32[s72, 1][1, 1]cuda:0" = pow_44.mean(dim = -1, keepdim = True); pow_44 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_131: "f32[s72, 1][1, 1]cuda:0" = variance_43 + 1e-05; variance_43 = None + rsqrt_43: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_131); add_131 = None + x_237: "f32[s72, 4096][4096, 1]cuda:0" = x_236 * rsqrt_43; x_236 = rsqrt_43 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_238: "bf16[s72, 4096][4096, 1]cuda:0" = x_237.to(torch.bfloat16); x_237 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_239: "bf16[s72, 4096][4096, 1]cuda:0" = x_238 * 
_get_data_attr_43; x_238 = _get_data_attr_43 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_87: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_239, l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_239 = l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_614: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_87[(Ellipsis, slice(None, 7168, None))] + silu_21: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_614); getitem_614 = None + getitem_615: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_87[(Ellipsis, slice(7168, None, None))]; output_parallel_87 = None + x_240: "bf16[s72, 7168][7168, 1]cuda:0" = silu_21 * getitem_615; silu_21 = getitem_615 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_88: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_240, l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, None); x_240 = l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_132: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_88, group_name = 'tp:0'); output_parallel_88 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_44: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_241: "f32[s72, 4096][4096, 1]cuda:0" = output_132.to(torch.float32); output_132 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_242: "f32[s72, 4096][4096, 1]cuda:0" = x_241 + residual_42; x_241 = residual_42 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_43: "bf16[s72, 4096][4096, 1]cuda:0" = x_242.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_45: "f32[s72, 4096][4096, 1]cuda:0" = x_242.pow(2) + variance_44: "f32[s72, 1][1, 1]cuda:0" = pow_45.mean(dim = -1, keepdim = True); pow_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_133: "f32[s72, 1][1, 1]cuda:0" = variance_44 + 1e-05; variance_44 = None + rsqrt_44: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_133); add_133 = None + x_243: "f32[s72, 4096][4096, 1]cuda:0" = x_242 * rsqrt_44; 
x_242 = rsqrt_44 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_244: "bf16[s72, 4096][4096, 1]cuda:0" = x_243.to(torch.bfloat16); x_243 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_245: "bf16[s72, 4096][4096, 1]cuda:0" = x_244 * _get_data_attr_44; x_244 = _get_data_attr_44 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_89: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_245, l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_245 = l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_22 = output_parallel_89.split([2048, 512, 512], dim = -1); output_parallel_89 = None + q_22: "bf16[s72, 2048][3072, 1]cuda:0" = split_22[0] + k_22: "bf16[s72, 512][3072, 1]cuda:0" = split_22[1] + v_22: "bf16[s72, 512][3072, 1]cuda:0" = split_22[2]; split_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_22: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_22: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_22); positions_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_66 = cos_sin_22.chunk(2, dim = -1); cos_sin_22 = None + cos_66: "bf16[s72, 64][128, 1]cuda:0" = chunk_66[0] + sin_66: "bf16[s72, 64][128, 1]cuda:0" = chunk_66[1]; chunk_66 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_156 = q_22.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_66: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_22.view(s72, -1, 128); q_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_22: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_66[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_22: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_66[(Ellipsis, slice(128, None, None))]; query_66 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_89: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_66.unsqueeze(-2) + cos_67: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_89.to(torch.bfloat16); unsqueeze_89 = 
None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_90: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_66.unsqueeze(-2) + sin_67: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_90.to(torch.bfloat16); unsqueeze_90 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_67 = torch.chunk(query_rot_22, 2, dim = -1); query_rot_22 = None + x1_44: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_67[0] + x2_44: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_67[1]; chunk_67 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_291: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_44 * cos_67 + mul_292: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_44 * sin_67 + o1_44: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_291 - mul_292; mul_291 = mul_292 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_293: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_44 * cos_67; x2_44 = cos_67 = None + mul_294: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_44 * sin_67; x1_44 = sin_67 = None + o2_44: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_293 + mul_294; mul_293 = mul_294 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_133: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_44, o2_44), dim = -1); o1_44 = o2_44 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_89: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_133, query_pass_22), dim = -1); output_133 = query_pass_22 = None + query_67: "bf16[s72, 2048][2048, 1]cuda:0" = cat_89.reshape(size_156); cat_89 = size_156 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_157 = k_22.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_66: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_22.view(s72, -1, 128); k_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_22: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_66[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_22: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_66[(Ellipsis, slice(128, None, None))]; key_66 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_91: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_66.unsqueeze(-2); cos_66 = None + cos_68: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_91.to(torch.bfloat16); unsqueeze_91 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_92: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_66.unsqueeze(-2); sin_66 = None + sin_68: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_92.to(torch.bfloat16); unsqueeze_92 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_68 = torch.chunk(key_rot_22, 2, dim = -1); key_rot_22 = None + x1_45: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_68[0] + x2_45: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_68[1]; chunk_68 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_295: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_45 * cos_68 + mul_296: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_45 * sin_68 + o1_45: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_295 - mul_296; mul_295 = mul_296 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_297: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_45 * cos_68; x2_45 = cos_68 = None + mul_298: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_45 * sin_68; x1_45 = sin_68 = None + o2_45: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_297 + mul_298; mul_297 = mul_298 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_134: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_45, o2_45), dim = -1); o1_45 = o2_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_91: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_134, key_pass_22), dim = -1); output_134 = key_pass_22 = None + key_67: "bf16[s72, 512][512, 1]cuda:0" = cat_91.reshape(size_157); cat_91 = size_157 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_158 = query_67.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_135: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_158, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_158 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_68: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_67.view(-1, 16, 128); query_67 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_136: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_135.view(-1, 16, 128); output_135 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_68: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_67.view(-1, 4, 128); key_67 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_22: "bf16[s72, 4, 128][3072, 
128, 1]cuda:0" = v_22.view(-1, 4, 128); v_22 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_22 = torch.ops.vllm.unified_attention_with_output(query_68, key_68, value_22, output_136, 'model.layers.22.self_attn.attn'); query_68 = key_68 = value_22 = unified_attention_with_output_22 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_22: "bf16[s72, 2048][2048, 1]cuda:0" = output_136.view(-1, 2048); output_136 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_90: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_22, l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_22 = l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_137: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_90, group_name = 'tp:0'); output_parallel_90 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_45: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_246: "f32[s72, 4096][4096, 1]cuda:0" = output_137.to(torch.float32); output_137 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_247: "f32[s72, 4096][4096, 1]cuda:0" = x_246 + residual_43; x_246 = residual_43 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_44: "bf16[s72, 4096][4096, 1]cuda:0" = x_247.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_46: "f32[s72, 4096][4096, 1]cuda:0" = x_247.pow(2) + variance_45: "f32[s72, 1][1, 1]cuda:0" = pow_46.mean(dim = -1, keepdim = True); pow_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_137: "f32[s72, 1][1, 1]cuda:0" = variance_45 + 1e-05; variance_45 = None + rsqrt_45: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_137); add_137 = None + x_248: "f32[s72, 4096][4096, 1]cuda:0" = x_247 * rsqrt_45; x_247 = rsqrt_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_249: "bf16[s72, 4096][4096, 1]cuda:0" = x_248.to(torch.bfloat16); x_248 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_250: "bf16[s72, 4096][4096, 1]cuda:0" = x_249 * 
_get_data_attr_45; x_249 = _get_data_attr_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_91: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_250, l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_250 = l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_642: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_91[(Ellipsis, slice(None, 7168, None))] + silu_22: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_642); getitem_642 = None + getitem_643: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_91[(Ellipsis, slice(7168, None, None))]; output_parallel_91 = None + x_251: "bf16[s72, 7168][7168, 1]cuda:0" = silu_22 * getitem_643; silu_22 = getitem_643 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_92: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_251, l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_, None); x_251 = l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_138: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_92, group_name = 'tp:0'); output_parallel_92 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_46: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_252: "f32[s72, 4096][4096, 1]cuda:0" = output_138.to(torch.float32); output_138 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_253: "f32[s72, 4096][4096, 1]cuda:0" = x_252 + residual_44; x_252 = residual_44 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_45: "bf16[s72, 4096][4096, 1]cuda:0" = x_253.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_47: "f32[s72, 4096][4096, 1]cuda:0" = x_253.pow(2) + variance_46: "f32[s72, 1][1, 1]cuda:0" = pow_47.mean(dim = -1, keepdim = True); pow_47 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_139: "f32[s72, 1][1, 1]cuda:0" = variance_46 + 1e-05; variance_46 = None + rsqrt_46: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_139); add_139 = None + x_254: "f32[s72, 4096][4096, 1]cuda:0" = x_253 * rsqrt_46; 
x_253 = rsqrt_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_255: "bf16[s72, 4096][4096, 1]cuda:0" = x_254.to(torch.bfloat16); x_254 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_256: "bf16[s72, 4096][4096, 1]cuda:0" = x_255 * _get_data_attr_46; x_255 = _get_data_attr_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_93: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_256, l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_256 = l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_23 = output_parallel_93.split([2048, 512, 512], dim = -1); output_parallel_93 = None + q_23: "bf16[s72, 2048][3072, 1]cuda:0" = split_23[0] + k_23: "bf16[s72, 512][3072, 1]cuda:0" = split_23[1] + v_23: "bf16[s72, 512][3072, 1]cuda:0" = split_23[2]; split_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_23: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_23: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_23); positions_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_69 = cos_sin_23.chunk(2, dim = -1); cos_sin_23 = None + cos_69: "bf16[s72, 64][128, 1]cuda:0" = chunk_69[0] + sin_69: "bf16[s72, 64][128, 1]cuda:0" = chunk_69[1]; chunk_69 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_163 = q_23.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_69: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_23.view(s72, -1, 128); q_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_23: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_69[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_23: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_69[(Ellipsis, slice(128, None, None))]; query_69 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_93: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_69.unsqueeze(-2) + cos_70: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_93.to(torch.bfloat16); unsqueeze_93 = 
None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_94: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_69.unsqueeze(-2) + sin_70: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_94.to(torch.bfloat16); unsqueeze_94 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_70 = torch.chunk(query_rot_23, 2, dim = -1); query_rot_23 = None + x1_46: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_70[0] + x2_46: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_70[1]; chunk_70 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_304: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_46 * cos_70 + mul_305: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_46 * sin_70 + o1_46: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_304 - mul_305; mul_304 = mul_305 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_306: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_46 * cos_70; x2_46 = cos_70 = None + mul_307: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_46 * sin_70; x1_46 = sin_70 = None + o2_46: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_306 + mul_307; mul_306 = mul_307 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_139: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_46, o2_46), dim = -1); o1_46 = o2_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_93: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_139, query_pass_23), dim = -1); output_139 = query_pass_23 = None + query_70: "bf16[s72, 2048][2048, 1]cuda:0" = cat_93.reshape(size_163); cat_93 = size_163 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_164 = k_23.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_69: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_23.view(s72, -1, 128); k_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_23: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_69[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_23: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_69[(Ellipsis, slice(128, None, None))]; key_69 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_95: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_69.unsqueeze(-2); cos_69 = None + cos_71: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_95.to(torch.bfloat16); unsqueeze_95 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_96: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_69.unsqueeze(-2); sin_69 = None + sin_71: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_96.to(torch.bfloat16); unsqueeze_96 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_71 = torch.chunk(key_rot_23, 2, dim = -1); key_rot_23 = None + x1_47: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_71[0] + x2_47: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_71[1]; chunk_71 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_308: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_47 * cos_71 + mul_309: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_47 * sin_71 + o1_47: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_308 - mul_309; mul_308 = mul_309 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_310: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_47 * cos_71; x2_47 = cos_71 = None + mul_311: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_47 * sin_71; x1_47 = sin_71 = None + o2_47: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_310 + mul_311; mul_310 = mul_311 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_140: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_47, o2_47), dim = -1); o1_47 = o2_47 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_95: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_140, key_pass_23), dim = -1); output_140 = key_pass_23 = None + key_70: "bf16[s72, 512][512, 1]cuda:0" = cat_95.reshape(size_164); cat_95 = size_164 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_165 = query_70.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_141: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_165, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_165 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_71: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_70.view(-1, 16, 128); query_70 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_142: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_141.view(-1, 16, 128); output_141 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_71: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_70.view(-1, 4, 128); key_70 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_23: "bf16[s72, 4, 128][3072, 
128, 1]cuda:0" = v_23.view(-1, 4, 128); v_23 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_23 = torch.ops.vllm.unified_attention_with_output(query_71, key_71, value_23, output_142, 'model.layers.23.self_attn.attn'); query_71 = key_71 = value_23 = unified_attention_with_output_23 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_23: "bf16[s72, 2048][2048, 1]cuda:0" = output_142.view(-1, 2048); output_142 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_94: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_23, l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_23 = l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_143: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_94, group_name = 'tp:0'); output_parallel_94 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_47: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_257: "f32[s72, 4096][4096, 1]cuda:0" = output_143.to(torch.float32); output_143 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_258: "f32[s72, 4096][4096, 1]cuda:0" = x_257 + residual_45; x_257 = residual_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_46: "bf16[s72, 4096][4096, 1]cuda:0" = x_258.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_48: "f32[s72, 4096][4096, 1]cuda:0" = x_258.pow(2) + variance_47: "f32[s72, 1][1, 1]cuda:0" = pow_48.mean(dim = -1, keepdim = True); pow_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_143: "f32[s72, 1][1, 1]cuda:0" = variance_47 + 1e-05; variance_47 = None + rsqrt_47: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_143); add_143 = None + x_259: "f32[s72, 4096][4096, 1]cuda:0" = x_258 * rsqrt_47; x_258 = rsqrt_47 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_260: "bf16[s72, 4096][4096, 1]cuda:0" = x_259.to(torch.bfloat16); x_259 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_261: "bf16[s72, 4096][4096, 1]cuda:0" = x_260 * 
_get_data_attr_47; x_260 = _get_data_attr_47 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_95: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_261, l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_261 = l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_670: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_95[(Ellipsis, slice(None, 7168, None))] + silu_23: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_670); getitem_670 = None + getitem_671: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_95[(Ellipsis, slice(7168, None, None))]; output_parallel_95 = None + x_262: "bf16[s72, 7168][7168, 1]cuda:0" = silu_23 * getitem_671; silu_23 = getitem_671 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_96: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_262, l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_, None); x_262 = l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_144: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_96, group_name = 'tp:0'); output_parallel_96 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_48: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_263: "f32[s72, 4096][4096, 1]cuda:0" = output_144.to(torch.float32); output_144 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_264: "f32[s72, 4096][4096, 1]cuda:0" = x_263 + residual_46; x_263 = residual_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_47: "bf16[s72, 4096][4096, 1]cuda:0" = x_264.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_49: "f32[s72, 4096][4096, 1]cuda:0" = x_264.pow(2) + variance_48: "f32[s72, 1][1, 1]cuda:0" = pow_49.mean(dim = -1, keepdim = True); pow_49 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_145: "f32[s72, 1][1, 1]cuda:0" = variance_48 + 1e-05; variance_48 = None + rsqrt_48: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_145); add_145 = None + x_265: "f32[s72, 4096][4096, 1]cuda:0" = x_264 * rsqrt_48; 
x_264 = rsqrt_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_266: "bf16[s72, 4096][4096, 1]cuda:0" = x_265.to(torch.bfloat16); x_265 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_267: "bf16[s72, 4096][4096, 1]cuda:0" = x_266 * _get_data_attr_48; x_266 = _get_data_attr_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_97: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_267, l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_267 = l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_24 = output_parallel_97.split([2048, 512, 512], dim = -1); output_parallel_97 = None + q_24: "bf16[s72, 2048][3072, 1]cuda:0" = split_24[0] + k_24: "bf16[s72, 512][3072, 1]cuda:0" = split_24[1] + v_24: "bf16[s72, 512][3072, 1]cuda:0" = split_24[2]; split_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_24: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_24: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_24); positions_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_72 = cos_sin_24.chunk(2, dim = -1); cos_sin_24 = None + cos_72: "bf16[s72, 64][128, 1]cuda:0" = chunk_72[0] + sin_72: "bf16[s72, 64][128, 1]cuda:0" = chunk_72[1]; chunk_72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_170 = q_24.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_72: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_24.view(s72, -1, 128); q_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_24: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_72[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_24: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_72[(Ellipsis, slice(128, None, None))]; query_72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_97: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_72.unsqueeze(-2) + cos_73: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_97.to(torch.bfloat16); unsqueeze_97 = 
None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_98: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_72.unsqueeze(-2) + sin_73: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_98.to(torch.bfloat16); unsqueeze_98 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_73 = torch.chunk(query_rot_24, 2, dim = -1); query_rot_24 = None + x1_48: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_73[0] + x2_48: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_73[1]; chunk_73 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_317: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_48 * cos_73 + mul_318: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_48 * sin_73 + o1_48: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_317 - mul_318; mul_317 = mul_318 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_319: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_48 * cos_73; x2_48 = cos_73 = None + mul_320: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_48 * sin_73; x1_48 = sin_73 = None + o2_48: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_319 + mul_320; mul_319 = mul_320 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_145: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_48, o2_48), dim = -1); o1_48 = o2_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_97: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_145, query_pass_24), dim = -1); output_145 = query_pass_24 = None + query_73: "bf16[s72, 2048][2048, 1]cuda:0" = cat_97.reshape(size_170); cat_97 = size_170 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_171 = k_24.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_72: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_24.view(s72, -1, 128); k_24 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_24: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_72[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_24: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_72[(Ellipsis, slice(128, None, None))]; key_72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_99: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_72.unsqueeze(-2); cos_72 = None + cos_74: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_99.to(torch.bfloat16); unsqueeze_99 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_100: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_72.unsqueeze(-2); sin_72 = None + sin_74: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_100.to(torch.bfloat16); unsqueeze_100 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_74 = torch.chunk(key_rot_24, 2, dim = -1); key_rot_24 = None + x1_49: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_74[0] + x2_49: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_74[1]; chunk_74 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_321: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_49 * cos_74 + mul_322: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_49 * sin_74 + o1_49: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_321 - mul_322; mul_321 = mul_322 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_323: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_49 * cos_74; x2_49 = cos_74 = None + mul_324: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_49 * sin_74; x1_49 = sin_74 = None + o2_49: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_323 + mul_324; mul_323 = mul_324 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_146: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_49, o2_49), dim = -1); o1_49 = o2_49 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_99: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_146, key_pass_24), dim = -1); output_146 = key_pass_24 = None + key_73: "bf16[s72, 512][512, 1]cuda:0" = cat_99.reshape(size_171); cat_99 = size_171 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_172 = query_73.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_147: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_172, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_172 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_74: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_73.view(-1, 16, 128); query_73 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_148: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_147.view(-1, 16, 128); output_147 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_74: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_73.view(-1, 4, 128); key_73 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_24: "bf16[s72, 4, 
128][3072, 128, 1]cuda:0" = v_24.view(-1, 4, 128); v_24 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_24 = torch.ops.vllm.unified_attention_with_output(query_74, key_74, value_24, output_148, 'model.layers.24.self_attn.attn'); query_74 = key_74 = value_24 = unified_attention_with_output_24 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_24: "bf16[s72, 2048][2048, 1]cuda:0" = output_148.view(-1, 2048); output_148 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_98: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_24, l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_24 = l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_149: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_98, group_name = 'tp:0'); output_parallel_98 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_49: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_268: "f32[s72, 4096][4096, 1]cuda:0" = output_149.to(torch.float32); output_149 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_269: "f32[s72, 4096][4096, 1]cuda:0" = x_268 + residual_47; x_268 = residual_47 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_48: "bf16[s72, 4096][4096, 1]cuda:0" = x_269.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_50: "f32[s72, 4096][4096, 1]cuda:0" = x_269.pow(2) + variance_49: "f32[s72, 1][1, 1]cuda:0" = pow_50.mean(dim = -1, keepdim = True); pow_50 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_149: "f32[s72, 1][1, 1]cuda:0" = variance_49 + 1e-05; variance_49 = None + rsqrt_49: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_149); add_149 = None + x_270: "f32[s72, 4096][4096, 1]cuda:0" = x_269 * rsqrt_49; x_269 = rsqrt_49 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_271: "bf16[s72, 4096][4096, 1]cuda:0" = x_270.to(torch.bfloat16); x_270 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_272: "bf16[s72, 4096][4096, 1]cuda:0" = 
x_271 * _get_data_attr_49; x_271 = _get_data_attr_49 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_99: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_272, l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_272 = l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_698: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_99[(Ellipsis, slice(None, 7168, None))] + silu_24: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_698); getitem_698 = None + getitem_699: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_99[(Ellipsis, slice(7168, None, None))]; output_parallel_99 = None + x_273: "bf16[s72, 7168][7168, 1]cuda:0" = silu_24 * getitem_699; silu_24 = getitem_699 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_100: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_273, l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_, None); x_273 = l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_150: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_100, group_name = 'tp:0'); output_parallel_100 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_50: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_274: "f32[s72, 4096][4096, 1]cuda:0" = output_150.to(torch.float32); output_150 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_275: "f32[s72, 4096][4096, 1]cuda:0" = x_274 + residual_48; x_274 = residual_48 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_49: "bf16[s72, 4096][4096, 1]cuda:0" = x_275.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_51: "f32[s72, 4096][4096, 1]cuda:0" = x_275.pow(2) + variance_50: "f32[s72, 1][1, 1]cuda:0" = pow_51.mean(dim = -1, keepdim = True); pow_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_151: "f32[s72, 1][1, 1]cuda:0" = variance_50 + 1e-05; variance_50 = None + rsqrt_50: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_151); add_151 = None + x_276: "f32[s72, 4096][4096, 1]cuda:0" = x_275 * 
rsqrt_50; x_275 = rsqrt_50 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_277: "bf16[s72, 4096][4096, 1]cuda:0" = x_276.to(torch.bfloat16); x_276 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_278: "bf16[s72, 4096][4096, 1]cuda:0" = x_277 * _get_data_attr_50; x_277 = _get_data_attr_50 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_101: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_278, l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_278 = l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_25 = output_parallel_101.split([2048, 512, 512], dim = -1); output_parallel_101 = None + q_25: "bf16[s72, 2048][3072, 1]cuda:0" = split_25[0] + k_25: "bf16[s72, 512][3072, 1]cuda:0" = split_25[1] + v_25: "bf16[s72, 512][3072, 1]cuda:0" = split_25[2]; split_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_25: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_25: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_25); positions_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_75 = cos_sin_25.chunk(2, dim = -1); cos_sin_25 = None + cos_75: "bf16[s72, 64][128, 1]cuda:0" = chunk_75[0] + sin_75: "bf16[s72, 64][128, 1]cuda:0" = chunk_75[1]; chunk_75 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_177 = q_25.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_75: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_25.view(s72, -1, 128); q_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_25: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_75[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_25: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_75[(Ellipsis, slice(128, None, None))]; query_75 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_101: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_75.unsqueeze(-2) + cos_76: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_101.to(torch.bfloat16); 
unsqueeze_101 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_102: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_75.unsqueeze(-2) + sin_76: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_102.to(torch.bfloat16); unsqueeze_102 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_76 = torch.chunk(query_rot_25, 2, dim = -1); query_rot_25 = None + x1_50: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_76[0] + x2_50: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_76[1]; chunk_76 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_330: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_50 * cos_76 + mul_331: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_50 * sin_76 + o1_50: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_330 - mul_331; mul_330 = mul_331 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_332: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_50 * cos_76; x2_50 = cos_76 = None + mul_333: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_50 * sin_76; x1_50 = sin_76 = None + o2_50: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_332 + mul_333; mul_332 = mul_333 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_151: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_50, o2_50), dim = -1); o1_50 = o2_50 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_101: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_151, query_pass_25), dim = -1); output_151 = query_pass_25 = None + query_76: "bf16[s72, 2048][2048, 1]cuda:0" = cat_101.reshape(size_177); cat_101 = size_177 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_178 = k_25.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_75: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_25.view(s72, -1, 128); k_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_25: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_75[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_25: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_75[(Ellipsis, slice(128, None, None))]; key_75 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_103: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_75.unsqueeze(-2); cos_75 = None + cos_77: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_103.to(torch.bfloat16); unsqueeze_103 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_104: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_75.unsqueeze(-2); sin_75 = None + sin_77: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_104.to(torch.bfloat16); unsqueeze_104 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_77 = torch.chunk(key_rot_25, 2, dim = -1); key_rot_25 = None + x1_51: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_77[0] + x2_51: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_77[1]; chunk_77 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_334: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_51 * cos_77 + mul_335: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_51 * sin_77 + o1_51: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_334 - mul_335; mul_334 = mul_335 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_336: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_51 * cos_77; x2_51 = cos_77 = None + mul_337: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_51 * sin_77; x1_51 = sin_77 = None + o2_51: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_336 + mul_337; mul_336 = mul_337 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_152: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_51, o2_51), dim = -1); o1_51 = o2_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_103: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_152, key_pass_25), dim = -1); output_152 = key_pass_25 = None + key_76: "bf16[s72, 512][512, 1]cuda:0" = cat_103.reshape(size_178); cat_103 = size_178 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_179 = query_76.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_153: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_179, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_179 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_77: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_76.view(-1, 16, 128); query_76 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_154: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_153.view(-1, 16, 128); output_153 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_77: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_76.view(-1, 4, 128); key_76 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_25: "bf16[s72, 4, 
128][3072, 128, 1]cuda:0" = v_25.view(-1, 4, 128); v_25 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_25 = torch.ops.vllm.unified_attention_with_output(query_77, key_77, value_25, output_154, 'model.layers.25.self_attn.attn'); query_77 = key_77 = value_25 = unified_attention_with_output_25 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_25: "bf16[s72, 2048][2048, 1]cuda:0" = output_154.view(-1, 2048); output_154 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_102: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_25, l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_25 = l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_155: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_102, group_name = 'tp:0'); output_parallel_102 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_51: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_279: "f32[s72, 4096][4096, 1]cuda:0" = output_155.to(torch.float32); output_155 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_280: "f32[s72, 4096][4096, 1]cuda:0" = x_279 + residual_49; x_279 = residual_49 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_50: "bf16[s72, 4096][4096, 1]cuda:0" = x_280.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_52: "f32[s72, 4096][4096, 1]cuda:0" = x_280.pow(2) + variance_51: "f32[s72, 1][1, 1]cuda:0" = pow_52.mean(dim = -1, keepdim = True); pow_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_155: "f32[s72, 1][1, 1]cuda:0" = variance_51 + 1e-05; variance_51 = None + rsqrt_51: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_155); add_155 = None + x_281: "f32[s72, 4096][4096, 1]cuda:0" = x_280 * rsqrt_51; x_280 = rsqrt_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_282: "bf16[s72, 4096][4096, 1]cuda:0" = x_281.to(torch.bfloat16); x_281 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_283: "bf16[s72, 4096][4096, 1]cuda:0" 
= x_282 * _get_data_attr_51; x_282 = _get_data_attr_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_103: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_283, l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_283 = l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_726: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_103[(Ellipsis, slice(None, 7168, None))] + silu_25: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_726); getitem_726 = None + getitem_727: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_103[(Ellipsis, slice(7168, None, None))]; output_parallel_103 = None + x_284: "bf16[s72, 7168][7168, 1]cuda:0" = silu_25 * getitem_727; silu_25 = getitem_727 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_104: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_284, l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_, None); x_284 = l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_156: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_104, group_name = 'tp:0'); output_parallel_104 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_52: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_285: "f32[s72, 4096][4096, 1]cuda:0" = output_156.to(torch.float32); output_156 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_286: "f32[s72, 4096][4096, 1]cuda:0" = x_285 + residual_50; x_285 = residual_50 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_51: "bf16[s72, 4096][4096, 1]cuda:0" = x_286.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_53: "f32[s72, 4096][4096, 1]cuda:0" = x_286.pow(2) + variance_52: "f32[s72, 1][1, 1]cuda:0" = pow_53.mean(dim = -1, keepdim = True); pow_53 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_157: "f32[s72, 1][1, 1]cuda:0" = variance_52 + 1e-05; variance_52 = None + rsqrt_52: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_157); add_157 = None + x_287: "f32[s72, 4096][4096, 1]cuda:0" = 
x_286 * rsqrt_52; x_286 = rsqrt_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_288: "bf16[s72, 4096][4096, 1]cuda:0" = x_287.to(torch.bfloat16); x_287 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_289: "bf16[s72, 4096][4096, 1]cuda:0" = x_288 * _get_data_attr_52; x_288 = _get_data_attr_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_105: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_289, l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_289 = l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_26 = output_parallel_105.split([2048, 512, 512], dim = -1); output_parallel_105 = None + q_26: "bf16[s72, 2048][3072, 1]cuda:0" = split_26[0] + k_26: "bf16[s72, 512][3072, 1]cuda:0" = split_26[1] + v_26: "bf16[s72, 512][3072, 1]cuda:0" = split_26[2]; split_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_26: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_26: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_26); positions_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_78 = cos_sin_26.chunk(2, dim = -1); cos_sin_26 = None + cos_78: "bf16[s72, 64][128, 1]cuda:0" = chunk_78[0] + sin_78: "bf16[s72, 64][128, 1]cuda:0" = chunk_78[1]; chunk_78 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_184 = q_26.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_78: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_26.view(s72, -1, 128); q_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_26: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_78[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_26: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_78[(Ellipsis, slice(128, None, None))]; query_78 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_105: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_78.unsqueeze(-2) + cos_79: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = 
unsqueeze_105.to(torch.bfloat16); unsqueeze_105 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_106: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_78.unsqueeze(-2) + sin_79: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_106.to(torch.bfloat16); unsqueeze_106 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_79 = torch.chunk(query_rot_26, 2, dim = -1); query_rot_26 = None + x1_52: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_79[0] + x2_52: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_79[1]; chunk_79 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_343: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_52 * cos_79 + mul_344: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_52 * sin_79 + o1_52: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_343 - mul_344; mul_343 = mul_344 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_345: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_52 * cos_79; x2_52 = cos_79 = None + mul_346: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_52 * sin_79; x1_52 = sin_79 = None + o2_52: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_345 + mul_346; mul_345 = mul_346 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_157: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_52, o2_52), dim = -1); o1_52 = o2_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_105: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_157, query_pass_26), dim = -1); output_157 = query_pass_26 = None + query_79: "bf16[s72, 2048][2048, 1]cuda:0" = cat_105.reshape(size_184); cat_105 = size_184 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_185 = k_26.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_78: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_26.view(s72, -1, 128); k_26 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_26: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_78[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_26: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_78[(Ellipsis, slice(128, None, None))]; key_78 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_107: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_78.unsqueeze(-2); cos_78 = None + cos_80: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_107.to(torch.bfloat16); unsqueeze_107 = None + + # 
File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_108: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_78.unsqueeze(-2); sin_78 = None + sin_80: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_108.to(torch.bfloat16); unsqueeze_108 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_80 = torch.chunk(key_rot_26, 2, dim = -1); key_rot_26 = None + x1_53: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_80[0] + x2_53: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_80[1]; chunk_80 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_347: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_53 * cos_80 + mul_348: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_53 * sin_80 + o1_53: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_347 - mul_348; mul_347 = mul_348 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_349: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_53 * cos_80; x2_53 = cos_80 = None + mul_350: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_53 * sin_80; x1_53 = sin_80 = None + o2_53: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_349 + mul_350; mul_349 = mul_350 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_158: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_53, o2_53), dim = -1); o1_53 = o2_53 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_107: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_158, key_pass_26), dim = -1); output_158 = key_pass_26 = None + key_79: "bf16[s72, 512][512, 1]cuda:0" = cat_107.reshape(size_185); cat_107 = size_185 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_186 = query_79.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_159: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_186, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_186 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_80: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_79.view(-1, 16, 128); query_79 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_160: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_159.view(-1, 16, 128); output_159 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_80: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_79.view(-1, 4, 128); key_79 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_26: "bf16[s72, 4, 
128][3072, 128, 1]cuda:0" = v_26.view(-1, 4, 128); v_26 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_26 = torch.ops.vllm.unified_attention_with_output(query_80, key_80, value_26, output_160, 'model.layers.26.self_attn.attn'); query_80 = key_80 = value_26 = unified_attention_with_output_26 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_26: "bf16[s72, 2048][2048, 1]cuda:0" = output_160.view(-1, 2048); output_160 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_106: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_26, l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_26 = l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_161: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_106, group_name = 'tp:0'); output_parallel_106 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_53: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_290: "f32[s72, 4096][4096, 1]cuda:0" = output_161.to(torch.float32); output_161 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_291: "f32[s72, 4096][4096, 1]cuda:0" = x_290 + residual_51; x_290 = residual_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_52: "bf16[s72, 4096][4096, 1]cuda:0" = x_291.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_54: "f32[s72, 4096][4096, 1]cuda:0" = x_291.pow(2) + variance_53: "f32[s72, 1][1, 1]cuda:0" = pow_54.mean(dim = -1, keepdim = True); pow_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_161: "f32[s72, 1][1, 1]cuda:0" = variance_53 + 1e-05; variance_53 = None + rsqrt_53: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_161); add_161 = None + x_292: "f32[s72, 4096][4096, 1]cuda:0" = x_291 * rsqrt_53; x_291 = rsqrt_53 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_293: "bf16[s72, 4096][4096, 1]cuda:0" = x_292.to(torch.bfloat16); x_292 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_294: "bf16[s72, 4096][4096, 1]cuda:0" 
= x_293 * _get_data_attr_53; x_293 = _get_data_attr_53 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_107: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_294, l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_294 = l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_754: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_107[(Ellipsis, slice(None, 7168, None))] + silu_26: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_754); getitem_754 = None + getitem_755: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_107[(Ellipsis, slice(7168, None, None))]; output_parallel_107 = None + x_295: "bf16[s72, 7168][7168, 1]cuda:0" = silu_26 * getitem_755; silu_26 = getitem_755 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_108: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_295, l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_, None); x_295 = l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_162: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_108, group_name = 'tp:0'); output_parallel_108 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_54: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_296: "f32[s72, 4096][4096, 1]cuda:0" = output_162.to(torch.float32); output_162 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_297: "f32[s72, 4096][4096, 1]cuda:0" = x_296 + residual_52; x_296 = residual_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_53: "bf16[s72, 4096][4096, 1]cuda:0" = x_297.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_55: "f32[s72, 4096][4096, 1]cuda:0" = x_297.pow(2) + variance_54: "f32[s72, 1][1, 1]cuda:0" = pow_55.mean(dim = -1, keepdim = True); pow_55 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_163: "f32[s72, 1][1, 1]cuda:0" = variance_54 + 1e-05; variance_54 = None + rsqrt_54: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_163); add_163 = None + x_298: "f32[s72, 4096][4096, 1]cuda:0" = 
x_297 * rsqrt_54; x_297 = rsqrt_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_299: "bf16[s72, 4096][4096, 1]cuda:0" = x_298.to(torch.bfloat16); x_298 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_300: "bf16[s72, 4096][4096, 1]cuda:0" = x_299 * _get_data_attr_54; x_299 = _get_data_attr_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_109: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_300, l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_300 = l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_27 = output_parallel_109.split([2048, 512, 512], dim = -1); output_parallel_109 = None + q_27: "bf16[s72, 2048][3072, 1]cuda:0" = split_27[0] + k_27: "bf16[s72, 512][3072, 1]cuda:0" = split_27[1] + v_27: "bf16[s72, 512][3072, 1]cuda:0" = split_27[2]; split_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_27: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_27: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_27); positions_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_81 = cos_sin_27.chunk(2, dim = -1); cos_sin_27 = None + cos_81: "bf16[s72, 64][128, 1]cuda:0" = chunk_81[0] + sin_81: "bf16[s72, 64][128, 1]cuda:0" = chunk_81[1]; chunk_81 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_191 = q_27.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_81: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_27.view(s72, -1, 128); q_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_27: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_81[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_27: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_81[(Ellipsis, slice(128, None, None))]; query_81 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_109: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_81.unsqueeze(-2) + cos_82: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = 
unsqueeze_109.to(torch.bfloat16); unsqueeze_109 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_110: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_81.unsqueeze(-2) + sin_82: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_110.to(torch.bfloat16); unsqueeze_110 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_82 = torch.chunk(query_rot_27, 2, dim = -1); query_rot_27 = None + x1_54: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_82[0] + x2_54: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_82[1]; chunk_82 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_356: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_54 * cos_82 + mul_357: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_54 * sin_82 + o1_54: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_356 - mul_357; mul_356 = mul_357 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_358: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_54 * cos_82; x2_54 = cos_82 = None + mul_359: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_54 * sin_82; x1_54 = sin_82 = None + o2_54: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_358 + mul_359; mul_358 = mul_359 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_163: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_54, o2_54), dim = -1); o1_54 = o2_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_109: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_163, query_pass_27), dim = -1); output_163 = query_pass_27 = None + query_82: "bf16[s72, 2048][2048, 1]cuda:0" = cat_109.reshape(size_191); cat_109 = size_191 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_192 = k_27.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_81: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_27.view(s72, -1, 128); k_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_27: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_81[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_27: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_81[(Ellipsis, slice(128, None, None))]; key_81 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_111: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_81.unsqueeze(-2); cos_81 = None + cos_83: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_111.to(torch.bfloat16); unsqueeze_111 = None + + # 
File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_112: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_81.unsqueeze(-2); sin_81 = None + sin_83: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_112.to(torch.bfloat16); unsqueeze_112 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_83 = torch.chunk(key_rot_27, 2, dim = -1); key_rot_27 = None + x1_55: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_83[0] + x2_55: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_83[1]; chunk_83 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_360: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_55 * cos_83 + mul_361: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_55 * sin_83 + o1_55: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_360 - mul_361; mul_360 = mul_361 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_362: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_55 * cos_83; x2_55 = cos_83 = None + mul_363: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_55 * sin_83; x1_55 = sin_83 = None + o2_55: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_362 + mul_363; mul_362 = mul_363 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_164: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_55, o2_55), dim = -1); o1_55 = o2_55 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_111: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_164, key_pass_27), dim = -1); output_164 = key_pass_27 = None + key_82: "bf16[s72, 512][512, 1]cuda:0" = cat_111.reshape(size_192); cat_111 = size_192 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_193 = query_82.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_165: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_193, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_193 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_83: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_82.view(-1, 16, 128); query_82 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_166: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_165.view(-1, 16, 128); output_165 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_83: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_82.view(-1, 4, 128); key_82 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_27: "bf16[s72, 4, 
128][3072, 128, 1]cuda:0" = v_27.view(-1, 4, 128); v_27 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_27 = torch.ops.vllm.unified_attention_with_output(query_83, key_83, value_27, output_166, 'model.layers.27.self_attn.attn'); query_83 = key_83 = value_27 = unified_attention_with_output_27 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_27: "bf16[s72, 2048][2048, 1]cuda:0" = output_166.view(-1, 2048); output_166 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_110: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_27, l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_27 = l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_167: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_110, group_name = 'tp:0'); output_parallel_110 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_55: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_301: "f32[s72, 4096][4096, 1]cuda:0" = output_167.to(torch.float32); output_167 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_302: "f32[s72, 4096][4096, 1]cuda:0" = x_301 + residual_53; x_301 = residual_53 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_54: "bf16[s72, 4096][4096, 1]cuda:0" = x_302.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_56: "f32[s72, 4096][4096, 1]cuda:0" = x_302.pow(2) + variance_55: "f32[s72, 1][1, 1]cuda:0" = pow_56.mean(dim = -1, keepdim = True); pow_56 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_167: "f32[s72, 1][1, 1]cuda:0" = variance_55 + 1e-05; variance_55 = None + rsqrt_55: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_167); add_167 = None + x_303: "f32[s72, 4096][4096, 1]cuda:0" = x_302 * rsqrt_55; x_302 = rsqrt_55 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_304: "bf16[s72, 4096][4096, 1]cuda:0" = x_303.to(torch.bfloat16); x_303 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_305: "bf16[s72, 4096][4096, 1]cuda:0" 
= x_304 * _get_data_attr_55; x_304 = _get_data_attr_55 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_111: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_305, l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_305 = l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_782: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_111[(Ellipsis, slice(None, 7168, None))] + silu_27: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_782); getitem_782 = None + getitem_783: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_111[(Ellipsis, slice(7168, None, None))]; output_parallel_111 = None + x_306: "bf16[s72, 7168][7168, 1]cuda:0" = silu_27 * getitem_783; silu_27 = getitem_783 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_112: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_306, l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_, None); x_306 = l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_168: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_112, group_name = 'tp:0'); output_parallel_112 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_56: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_307: "f32[s72, 4096][4096, 1]cuda:0" = output_168.to(torch.float32); output_168 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_308: "f32[s72, 4096][4096, 1]cuda:0" = x_307 + residual_54; x_307 = residual_54 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_55: "bf16[s72, 4096][4096, 1]cuda:0" = x_308.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_57: "f32[s72, 4096][4096, 1]cuda:0" = x_308.pow(2) + variance_56: "f32[s72, 1][1, 1]cuda:0" = pow_57.mean(dim = -1, keepdim = True); pow_57 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_169: "f32[s72, 1][1, 1]cuda:0" = variance_56 + 1e-05; variance_56 = None + rsqrt_56: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_169); add_169 = None + x_309: "f32[s72, 4096][4096, 1]cuda:0" = 
x_308 * rsqrt_56; x_308 = rsqrt_56 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_310: "bf16[s72, 4096][4096, 1]cuda:0" = x_309.to(torch.bfloat16); x_309 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_311: "bf16[s72, 4096][4096, 1]cuda:0" = x_310 * _get_data_attr_56; x_310 = _get_data_attr_56 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_113: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_311, l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_311 = l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_28 = output_parallel_113.split([2048, 512, 512], dim = -1); output_parallel_113 = None + q_28: "bf16[s72, 2048][3072, 1]cuda:0" = split_28[0] + k_28: "bf16[s72, 512][3072, 1]cuda:0" = split_28[1] + v_28: "bf16[s72, 512][3072, 1]cuda:0" = split_28[2]; split_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_28: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_28: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_28); positions_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_84 = cos_sin_28.chunk(2, dim = -1); cos_sin_28 = None + cos_84: "bf16[s72, 64][128, 1]cuda:0" = chunk_84[0] + sin_84: "bf16[s72, 64][128, 1]cuda:0" = chunk_84[1]; chunk_84 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_198 = q_28.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_84: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_28.view(s72, -1, 128); q_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_28: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_84[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_28: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_84[(Ellipsis, slice(128, None, None))]; query_84 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_113: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_84.unsqueeze(-2) + cos_85: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = 
unsqueeze_113.to(torch.bfloat16); unsqueeze_113 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_114: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_84.unsqueeze(-2) + sin_85: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_114.to(torch.bfloat16); unsqueeze_114 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_85 = torch.chunk(query_rot_28, 2, dim = -1); query_rot_28 = None + x1_56: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_85[0] + x2_56: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_85[1]; chunk_85 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_369: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_56 * cos_85 + mul_370: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_56 * sin_85 + o1_56: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_369 - mul_370; mul_369 = mul_370 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_371: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_56 * cos_85; x2_56 = cos_85 = None + mul_372: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_56 * sin_85; x1_56 = sin_85 = None + o2_56: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_371 + mul_372; mul_371 = mul_372 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_169: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_56, o2_56), dim = -1); o1_56 = o2_56 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_113: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_169, query_pass_28), dim = -1); output_169 = query_pass_28 = None + query_85: "bf16[s72, 2048][2048, 1]cuda:0" = cat_113.reshape(size_198); cat_113 = size_198 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_199 = k_28.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_84: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_28.view(s72, -1, 128); k_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_28: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_84[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_28: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_84[(Ellipsis, slice(128, None, None))]; key_84 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_115: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_84.unsqueeze(-2); cos_84 = None + cos_86: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_115.to(torch.bfloat16); unsqueeze_115 = None + + # 
File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_116: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_84.unsqueeze(-2); sin_84 = None + sin_86: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_116.to(torch.bfloat16); unsqueeze_116 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_86 = torch.chunk(key_rot_28, 2, dim = -1); key_rot_28 = None + x1_57: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_86[0] + x2_57: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_86[1]; chunk_86 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_373: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_57 * cos_86 + mul_374: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_57 * sin_86 + o1_57: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_373 - mul_374; mul_373 = mul_374 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_375: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_57 * cos_86; x2_57 = cos_86 = None + mul_376: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_57 * sin_86; x1_57 = sin_86 = None + o2_57: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_375 + mul_376; mul_375 = mul_376 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_170: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_57, o2_57), dim = -1); o1_57 = o2_57 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_115: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_170, key_pass_28), dim = -1); output_170 = key_pass_28 = None + key_85: "bf16[s72, 512][512, 1]cuda:0" = cat_115.reshape(size_199); cat_115 = size_199 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_200 = query_85.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_171: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_200, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_200 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_86: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_85.view(-1, 16, 128); query_85 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_172: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_171.view(-1, 16, 128); output_171 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_86: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_85.view(-1, 4, 128); key_85 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_28: "bf16[s72, 4, 
128][3072, 128, 1]cuda:0" = v_28.view(-1, 4, 128); v_28 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_28 = torch.ops.vllm.unified_attention_with_output(query_86, key_86, value_28, output_172, 'model.layers.28.self_attn.attn'); query_86 = key_86 = value_28 = unified_attention_with_output_28 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_28: "bf16[s72, 2048][2048, 1]cuda:0" = output_172.view(-1, 2048); output_172 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_114: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_28, l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_28 = l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_173: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_114, group_name = 'tp:0'); output_parallel_114 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_57: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_312: "f32[s72, 4096][4096, 1]cuda:0" = output_173.to(torch.float32); output_173 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_313: "f32[s72, 4096][4096, 1]cuda:0" = x_312 + residual_55; x_312 = residual_55 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_56: "bf16[s72, 4096][4096, 1]cuda:0" = x_313.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_58: "f32[s72, 4096][4096, 1]cuda:0" = x_313.pow(2) + variance_57: "f32[s72, 1][1, 1]cuda:0" = pow_58.mean(dim = -1, keepdim = True); pow_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_173: "f32[s72, 1][1, 1]cuda:0" = variance_57 + 1e-05; variance_57 = None + rsqrt_57: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_173); add_173 = None + x_314: "f32[s72, 4096][4096, 1]cuda:0" = x_313 * rsqrt_57; x_313 = rsqrt_57 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_315: "bf16[s72, 4096][4096, 1]cuda:0" = x_314.to(torch.bfloat16); x_314 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_316: "bf16[s72, 4096][4096, 1]cuda:0" 
= x_315 * _get_data_attr_57; x_315 = _get_data_attr_57 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_115: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_316, l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_316 = l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_810: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_115[(Ellipsis, slice(None, 7168, None))] + silu_28: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_810); getitem_810 = None + getitem_811: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_115[(Ellipsis, slice(7168, None, None))]; output_parallel_115 = None + x_317: "bf16[s72, 7168][7168, 1]cuda:0" = silu_28 * getitem_811; silu_28 = getitem_811 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_116: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_317, l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_, None); x_317 = l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_174: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_116, group_name = 'tp:0'); output_parallel_116 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_58: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_318: "f32[s72, 4096][4096, 1]cuda:0" = output_174.to(torch.float32); output_174 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_319: "f32[s72, 4096][4096, 1]cuda:0" = x_318 + residual_56; x_318 = residual_56 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_57: "bf16[s72, 4096][4096, 1]cuda:0" = x_319.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_59: "f32[s72, 4096][4096, 1]cuda:0" = x_319.pow(2) + variance_58: "f32[s72, 1][1, 1]cuda:0" = pow_59.mean(dim = -1, keepdim = True); pow_59 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_175: "f32[s72, 1][1, 1]cuda:0" = variance_58 + 1e-05; variance_58 = None + rsqrt_58: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_175); add_175 = None + x_320: "f32[s72, 4096][4096, 1]cuda:0" = 
x_319 * rsqrt_58; x_319 = rsqrt_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_321: "bf16[s72, 4096][4096, 1]cuda:0" = x_320.to(torch.bfloat16); x_320 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_322: "bf16[s72, 4096][4096, 1]cuda:0" = x_321 * _get_data_attr_58; x_321 = _get_data_attr_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_117: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_322, l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_322 = l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_29 = output_parallel_117.split([2048, 512, 512], dim = -1); output_parallel_117 = None + q_29: "bf16[s72, 2048][3072, 1]cuda:0" = split_29[0] + k_29: "bf16[s72, 512][3072, 1]cuda:0" = split_29[1] + v_29: "bf16[s72, 512][3072, 1]cuda:0" = split_29[2]; split_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_29: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_29: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_29); positions_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_87 = cos_sin_29.chunk(2, dim = -1); cos_sin_29 = None + cos_87: "bf16[s72, 64][128, 1]cuda:0" = chunk_87[0] + sin_87: "bf16[s72, 64][128, 1]cuda:0" = chunk_87[1]; chunk_87 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_205 = q_29.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_87: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_29.view(s72, -1, 128); q_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_29: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_87[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_29: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_87[(Ellipsis, slice(128, None, None))]; query_87 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_117: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_87.unsqueeze(-2) + cos_88: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = 
unsqueeze_117.to(torch.bfloat16); unsqueeze_117 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_118: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_87.unsqueeze(-2) + sin_88: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_118.to(torch.bfloat16); unsqueeze_118 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_88 = torch.chunk(query_rot_29, 2, dim = -1); query_rot_29 = None + x1_58: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_88[0] + x2_58: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_88[1]; chunk_88 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_382: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_58 * cos_88 + mul_383: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_58 * sin_88 + o1_58: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_382 - mul_383; mul_382 = mul_383 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_384: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_58 * cos_88; x2_58 = cos_88 = None + mul_385: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_58 * sin_88; x1_58 = sin_88 = None + o2_58: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_384 + mul_385; mul_384 = mul_385 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_175: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_58, o2_58), dim = -1); o1_58 = o2_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_117: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_175, query_pass_29), dim = -1); output_175 = query_pass_29 = None + query_88: "bf16[s72, 2048][2048, 1]cuda:0" = cat_117.reshape(size_205); cat_117 = size_205 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_206 = k_29.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_87: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_29.view(s72, -1, 128); k_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_29: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_87[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_29: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_87[(Ellipsis, slice(128, None, None))]; key_87 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_119: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_87.unsqueeze(-2); cos_87 = None + cos_89: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_119.to(torch.bfloat16); unsqueeze_119 = None + + # 
File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_120: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_87.unsqueeze(-2); sin_87 = None + sin_89: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_120.to(torch.bfloat16); unsqueeze_120 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_89 = torch.chunk(key_rot_29, 2, dim = -1); key_rot_29 = None + x1_59: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_89[0] + x2_59: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_89[1]; chunk_89 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_386: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_59 * cos_89 + mul_387: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_59 * sin_89 + o1_59: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_386 - mul_387; mul_386 = mul_387 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_388: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_59 * cos_89; x2_59 = cos_89 = None + mul_389: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_59 * sin_89; x1_59 = sin_89 = None + o2_59: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_388 + mul_389; mul_388 = mul_389 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_176: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_59, o2_59), dim = -1); o1_59 = o2_59 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_119: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_176, key_pass_29), dim = -1); output_176 = key_pass_29 = None + key_88: "bf16[s72, 512][512, 1]cuda:0" = cat_119.reshape(size_206); cat_119 = size_206 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_207 = query_88.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_177: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_207, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_207 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_89: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_88.view(-1, 16, 128); query_88 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_178: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_177.view(-1, 16, 128); output_177 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_89: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_88.view(-1, 4, 128); key_88 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_29: "bf16[s72, 4, 
128][3072, 128, 1]cuda:0" = v_29.view(-1, 4, 128); v_29 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_29 = torch.ops.vllm.unified_attention_with_output(query_89, key_89, value_29, output_178, 'model.layers.29.self_attn.attn'); query_89 = key_89 = value_29 = unified_attention_with_output_29 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_29: "bf16[s72, 2048][2048, 1]cuda:0" = output_178.view(-1, 2048); output_178 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_118: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_29, l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_29 = l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_179: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_118, group_name = 'tp:0'); output_parallel_118 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_59: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_323: "f32[s72, 4096][4096, 1]cuda:0" = output_179.to(torch.float32); output_179 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_324: "f32[s72, 4096][4096, 1]cuda:0" = x_323 + residual_57; x_323 = residual_57 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_58: "bf16[s72, 4096][4096, 1]cuda:0" = x_324.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_60: "f32[s72, 4096][4096, 1]cuda:0" = x_324.pow(2) + variance_59: "f32[s72, 1][1, 1]cuda:0" = pow_60.mean(dim = -1, keepdim = True); pow_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_179: "f32[s72, 1][1, 1]cuda:0" = variance_59 + 1e-05; variance_59 = None + rsqrt_59: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_179); add_179 = None + x_325: "f32[s72, 4096][4096, 1]cuda:0" = x_324 * rsqrt_59; x_324 = rsqrt_59 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_326: "bf16[s72, 4096][4096, 1]cuda:0" = x_325.to(torch.bfloat16); x_325 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_327: "bf16[s72, 4096][4096, 1]cuda:0" 
= x_326 * _get_data_attr_59; x_326 = _get_data_attr_59 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_119: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_327, l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_327 = l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_838: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_119[(Ellipsis, slice(None, 7168, None))] + silu_29: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_838); getitem_838 = None + getitem_839: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_119[(Ellipsis, slice(7168, None, None))]; output_parallel_119 = None + x_328: "bf16[s72, 7168][7168, 1]cuda:0" = silu_29 * getitem_839; silu_29 = getitem_839 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_120: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_328, l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_, None); x_328 = l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_180: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_120, group_name = 'tp:0'); output_parallel_120 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_60: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_329: "f32[s72, 4096][4096, 1]cuda:0" = output_180.to(torch.float32); output_180 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_330: "f32[s72, 4096][4096, 1]cuda:0" = x_329 + residual_58; x_329 = residual_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_59: "bf16[s72, 4096][4096, 1]cuda:0" = x_330.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_61: "f32[s72, 4096][4096, 1]cuda:0" = x_330.pow(2) + variance_60: "f32[s72, 1][1, 1]cuda:0" = pow_61.mean(dim = -1, keepdim = True); pow_61 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_181: "f32[s72, 1][1, 1]cuda:0" = variance_60 + 1e-05; variance_60 = None + rsqrt_60: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_181); add_181 = None + x_331: "f32[s72, 4096][4096, 1]cuda:0" = 
x_330 * rsqrt_60; x_330 = rsqrt_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_332: "bf16[s72, 4096][4096, 1]cuda:0" = x_331.to(torch.bfloat16); x_331 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_333: "bf16[s72, 4096][4096, 1]cuda:0" = x_332 * _get_data_attr_60; x_332 = _get_data_attr_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_121: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_333, l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_333 = l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_30 = output_parallel_121.split([2048, 512, 512], dim = -1); output_parallel_121 = None + q_30: "bf16[s72, 2048][3072, 1]cuda:0" = split_30[0] + k_30: "bf16[s72, 512][3072, 1]cuda:0" = split_30[1] + v_30: "bf16[s72, 512][3072, 1]cuda:0" = split_30[2]; split_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_30: "i64[s72][1]cuda:0" = l_positions_.flatten() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_30: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_30); positions_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_90 = cos_sin_30.chunk(2, dim = -1); cos_sin_30 = None + cos_90: "bf16[s72, 64][128, 1]cuda:0" = chunk_90[0] + sin_90: "bf16[s72, 64][128, 1]cuda:0" = chunk_90[1]; chunk_90 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_212 = q_30.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_90: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_30.view(s72, -1, 128); q_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_30: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_90[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_30: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_90[(Ellipsis, slice(128, None, None))]; query_90 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_121: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_90.unsqueeze(-2) + cos_91: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = 
unsqueeze_121.to(torch.bfloat16); unsqueeze_121 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_122: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_90.unsqueeze(-2) + sin_91: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_122.to(torch.bfloat16); unsqueeze_122 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_91 = torch.chunk(query_rot_30, 2, dim = -1); query_rot_30 = None + x1_60: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_91[0] + x2_60: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_91[1]; chunk_91 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_395: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_60 * cos_91 + mul_396: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_60 * sin_91 + o1_60: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_395 - mul_396; mul_395 = mul_396 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_397: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_60 * cos_91; x2_60 = cos_91 = None + mul_398: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_60 * sin_91; x1_60 = sin_91 = None + o2_60: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_397 + mul_398; mul_397 = mul_398 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_181: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_60, o2_60), dim = -1); o1_60 = o2_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_121: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_181, query_pass_30), dim = -1); output_181 = query_pass_30 = None + query_91: "bf16[s72, 2048][2048, 1]cuda:0" = cat_121.reshape(size_212); cat_121 = size_212 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_213 = k_30.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_90: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_30.view(s72, -1, 128); k_30 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_30: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_90[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_30: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_90[(Ellipsis, slice(128, None, None))]; key_90 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_123: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_90.unsqueeze(-2); cos_90 = None + cos_92: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_123.to(torch.bfloat16); unsqueeze_123 = None + + # 
File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_124: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_90.unsqueeze(-2); sin_90 = None + sin_92: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_124.to(torch.bfloat16); unsqueeze_124 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_92 = torch.chunk(key_rot_30, 2, dim = -1); key_rot_30 = None + x1_61: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_92[0] + x2_61: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_92[1]; chunk_92 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_399: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_61 * cos_92 + mul_400: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_61 * sin_92 + o1_61: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_399 - mul_400; mul_399 = mul_400 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_401: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_61 * cos_92; x2_61 = cos_92 = None + mul_402: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_61 * sin_92; x1_61 = sin_92 = None + o2_61: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_401 + mul_402; mul_401 = mul_402 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_182: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_61, o2_61), dim = -1); o1_61 = o2_61 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_123: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_182, key_pass_30), dim = -1); output_182 = key_pass_30 = None + key_91: "bf16[s72, 512][512, 1]cuda:0" = cat_123.reshape(size_213); cat_123 = size_213 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_214 = query_91.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_183: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_214, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_214 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_92: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_91.view(-1, 16, 128); query_91 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_184: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_183.view(-1, 16, 128); output_183 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_92: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_91.view(-1, 4, 128); key_91 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_30: "bf16[s72, 4, 
128][3072, 128, 1]cuda:0" = v_30.view(-1, 4, 128); v_30 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_30 = torch.ops.vllm.unified_attention_with_output(query_92, key_92, value_30, output_184, 'model.layers.30.self_attn.attn'); query_92 = key_92 = value_30 = unified_attention_with_output_30 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_30: "bf16[s72, 2048][2048, 1]cuda:0" = output_184.view(-1, 2048); output_184 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_122: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_30, l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_30 = l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_185: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_122, group_name = 'tp:0'); output_parallel_122 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_61: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_334: "f32[s72, 4096][4096, 1]cuda:0" = output_185.to(torch.float32); output_185 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_335: "f32[s72, 4096][4096, 1]cuda:0" = x_334 + residual_59; x_334 = residual_59 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_60: "bf16[s72, 4096][4096, 1]cuda:0" = x_335.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_62: "f32[s72, 4096][4096, 1]cuda:0" = x_335.pow(2) + variance_61: "f32[s72, 1][1, 1]cuda:0" = pow_62.mean(dim = -1, keepdim = True); pow_62 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_185: "f32[s72, 1][1, 1]cuda:0" = variance_61 + 1e-05; variance_61 = None + rsqrt_61: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_185); add_185 = None + x_336: "f32[s72, 4096][4096, 1]cuda:0" = x_335 * rsqrt_61; x_335 = rsqrt_61 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_337: "bf16[s72, 4096][4096, 1]cuda:0" = x_336.to(torch.bfloat16); x_336 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_338: "bf16[s72, 4096][4096, 1]cuda:0" 
= x_337 * _get_data_attr_61; x_337 = _get_data_attr_61 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_123: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_338, l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_338 = l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_866: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_123[(Ellipsis, slice(None, 7168, None))] + silu_30: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_866); getitem_866 = None + getitem_867: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_123[(Ellipsis, slice(7168, None, None))]; output_parallel_123 = None + x_339: "bf16[s72, 7168][7168, 1]cuda:0" = silu_30 * getitem_867; silu_30 = getitem_867 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_124: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_339, l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_, None); x_339 = l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_186: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_124, group_name = 'tp:0'); output_parallel_124 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_62: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_340: "f32[s72, 4096][4096, 1]cuda:0" = output_186.to(torch.float32); output_186 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_341: "f32[s72, 4096][4096, 1]cuda:0" = x_340 + residual_60; x_340 = residual_60 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_61: "bf16[s72, 4096][4096, 1]cuda:0" = x_341.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_63: "f32[s72, 4096][4096, 1]cuda:0" = x_341.pow(2) + variance_62: "f32[s72, 1][1, 1]cuda:0" = pow_63.mean(dim = -1, keepdim = True); pow_63 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_187: "f32[s72, 1][1, 1]cuda:0" = variance_62 + 1e-05; variance_62 = None + rsqrt_62: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_187); add_187 = None + x_342: "f32[s72, 4096][4096, 1]cuda:0" = 
x_341 * rsqrt_62; x_341 = rsqrt_62 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_343: "bf16[s72, 4096][4096, 1]cuda:0" = x_342.to(torch.bfloat16); x_342 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_344: "bf16[s72, 4096][4096, 1]cuda:0" = x_343 * _get_data_attr_62; x_343 = _get_data_attr_62 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_125: "bf16[s72, 3072][3072, 1]cuda:0" = torch._C._nn.linear(x_344, l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_, None); x_344 = l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_31 = output_parallel_125.split([2048, 512, 512], dim = -1); output_parallel_125 = None + q_31: "bf16[s72, 2048][3072, 1]cuda:0" = split_31[0] + k_31: "bf16[s72, 512][3072, 1]cuda:0" = split_31[1] + v_31: "bf16[s72, 512][3072, 1]cuda:0" = split_31[2]; split_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + positions_31: "i64[s72][1]cuda:0" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + cos_sin_31: "bf16[s72, 128][128, 1]cuda:0" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, positions_31); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = positions_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk_93 = cos_sin_31.chunk(2, dim = -1); cos_sin_31 = None + cos_93: "bf16[s72, 64][128, 1]cuda:0" = chunk_93[0] + sin_93: "bf16[s72, 64][128, 1]cuda:0" = chunk_93[1]; chunk_93 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size_219 = q_31.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + query_93: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = q_31.view(s72, -1, 128); q_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + query_rot_31: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = query_93[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + query_pass_31: "bf16[s72, 16, 0][3072, 128, 1]cuda:0" = query_93[(Ellipsis, slice(128, None, None))]; query_93 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_125: "bf16[s72, 1, 64][128, 
64, 1]cuda:0" = cos_93.unsqueeze(-2) + cos_94: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_125.to(torch.bfloat16); unsqueeze_125 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_126: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_93.unsqueeze(-2) + sin_94: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_126.to(torch.bfloat16); unsqueeze_126 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_94 = torch.chunk(query_rot_31, 2, dim = -1); query_rot_31 = None + x1_62: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_94[0] + x2_62: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = chunk_94[1]; chunk_94 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_408: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_62 * cos_94 + mul_409: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_62 * sin_94 + o1_62: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_408 - mul_409; mul_408 = mul_409 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_410: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x2_62 * cos_94; x2_62 = cos_94 = None + mul_411: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = x1_62 * sin_94; x1_62 = sin_94 = None + o2_62: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = mul_410 + mul_411; mul_410 = mul_411 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_187: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((o1_62, o2_62), dim = -1); o1_62 = o2_62 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_125: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.cat((output_187, query_pass_31), dim = -1); output_187 = query_pass_31 = None + query_94: "bf16[s72, 2048][2048, 1]cuda:0" = cat_125.reshape(size_219); cat_125 = size_219 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_220 = k_31.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + key_93: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = k_31.view(s72, -1, 128); k_31 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + key_rot_31: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = key_93[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + key_pass_31: "bf16[s72, 4, 0][3072, 128, 1]cuda:0" = key_93[(Ellipsis, slice(128, None, None))]; key_93 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_127: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = cos_93.unsqueeze(-2); cos_93 = None + cos_95: "bf16[s72, 
1, 64][128, 64, 1]cuda:0" = unsqueeze_127.to(torch.bfloat16); unsqueeze_127 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_128: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = sin_93.unsqueeze(-2); sin_93 = None + sin_95: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = unsqueeze_128.to(torch.bfloat16); unsqueeze_128 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_95 = torch.chunk(key_rot_31, 2, dim = -1); key_rot_31 = None + x1_63: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_95[0] + x2_63: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = chunk_95[1]; chunk_95 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_412: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_63 * cos_95 + mul_413: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_63 * sin_95 + o1_63: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_412 - mul_413; mul_412 = mul_413 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_414: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x2_63 * cos_95; x2_63 = cos_95 = None + mul_415: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = x1_63 * sin_95; x1_63 = sin_95 = None + o2_63: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = mul_414 + mul_415; mul_414 = mul_415 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + output_188: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((o1_63, o2_63), dim = -1); o1_63 = o2_63 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_127: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.cat((output_188, key_pass_31), dim = -1); output_188 = key_pass_31 = None + key_94: "bf16[s72, 512][512, 1]cuda:0" = cat_127.reshape(size_220); cat_127 = size_220 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_221 = query_94.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + output_189: "bf16[s72, 2048][2048, 1]cuda:0" = torch.empty(size_221, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_221 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + query_95: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = query_94.view(-1, 16, 128); query_94 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + output_190: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = output_189.view(-1, 16, 128); output_189 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + key_95: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = key_94.view(-1, 4, 128); key_94 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, 
code: value = value.view(-1, self.num_kv_heads, self.head_size) + value_31: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = v_31.view(-1, 4, 128); v_31 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output_31 = torch.ops.vllm.unified_attention_with_output(query_95, key_95, value_31, output_190, 'model.layers.31.self_attn.attn'); query_95 = key_95 = value_31 = unified_attention_with_output_31 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + attn_output_31: "bf16[s72, 2048][2048, 1]cuda:0" = output_190.view(-1, 2048); output_190 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_126: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(attn_output_31, l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_, None); attn_output_31 = l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_191: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_126, group_name = 'tp:0'); output_parallel_126 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_63: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_345: "f32[s72, 4096][4096, 1]cuda:0" = output_191.to(torch.float32); output_191 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_346: "f32[s72, 4096][4096, 1]cuda:0" = x_345 + residual_61; x_345 = residual_61 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_62: "bf16[s72, 4096][4096, 1]cuda:0" = x_346.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_64: "f32[s72, 4096][4096, 1]cuda:0" = x_346.pow(2) + variance_63: "f32[s72, 1][1, 1]cuda:0" = pow_64.mean(dim = -1, keepdim = True); pow_64 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_191: "f32[s72, 1][1, 1]cuda:0" = variance_63 + 1e-05; variance_63 = None + rsqrt_63: "f32[s72, 1][1, 1]cuda:0" = torch.rsqrt(add_191); add_191 = None + x_347: "f32[s72, 4096][4096, 1]cuda:0" = x_346 * rsqrt_63; x_346 = rsqrt_63 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_348: "bf16[s72, 4096][4096, 1]cuda:0" = x_347.to(torch.bfloat16); x_347 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_349: "bf16[s72, 4096][4096, 1]cuda:0" = x_348 * _get_data_attr_63; x_348 = _get_data_attr_63 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_127: "bf16[s72, 14336][14336, 1]cuda:0" = torch._C._nn.linear(x_349, l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_, None); x_349 = l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem_894: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_127[(Ellipsis, slice(None, 7168, None))] + silu_31: "bf16[s72, 7168][7168, 1]cuda:0" = torch.nn.functional.silu(getitem_894); getitem_894 = None + getitem_895: "bf16[s72, 7168][14336, 1]cuda:0" = output_parallel_127[(Ellipsis, slice(7168, None, None))]; output_parallel_127 = None + x_350: "bf16[s72, 7168][7168, 1]cuda:0" = silu_31 * getitem_895; silu_31 = getitem_895 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + output_parallel_128: "bf16[s72, 4096][4096, 1]cuda:0" = torch._C._nn.linear(x_350, l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_, None); x_350 = l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + output_192: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce(output_parallel_128, group_name = 'tp:0'); output_parallel_128 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_64: "bf16[4096][1]cuda:0" = torch._C._autograd._get_data_attr(l_self_modules_norm_parameters_weight_); l_self_modules_norm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + x_351: "f32[s72, 4096][4096, 1]cuda:0" = output_192.to(torch.float32); output_192 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + x_352: "f32[s72, 4096][4096, 1]cuda:0" = x_351 + residual_62; x_351 = residual_62 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + residual_63: "bf16[s72, 4096][4096, 1]cuda:0" = x_352.to(torch.bfloat16); residual_63 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_65: "f32[s72, 4096][4096, 1]cuda:0" = x_352.pow(2) + variance_64: "f32[s72, 1][1, 1]cuda:0" = pow_65.mean(dim = -1, keepdim = True); pow_65 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_193: "f32[s72, 1][1, 1]cuda:0" = variance_64 + 1e-05; variance_64 = None + rsqrt_64: "f32[s72, 
1][1, 1]cuda:0" = torch.rsqrt(add_193); add_193 = None + x_353: "f32[s72, 4096][4096, 1]cuda:0" = x_352 * rsqrt_64; x_352 = rsqrt_64 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + x_354: "bf16[s72, 4096][4096, 1]cuda:0" = x_353.to(torch.bfloat16); x_353 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + x_355: "bf16[s72, 4096][4096, 1]cuda:0" = x_354 * _get_data_attr_64; x_354 = _get_data_attr_64 = None + return (x_355,) + +V0127 17:17:45.180000 1175001 /data/users/angelayi/vllm/vllm/compilation/backends.py:601] {"artifact": {"name": "vllm_compilation_config", "encoding": "json"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "e58fd7e13a289e11aed19e3686dc21e0"} + {"model": "meta-llama/Llama-3.1-8B", "prefix": "backbone", "mode": "3", "backend": "inductor", "custom_ops": "none", "splitting_ops": "vllm::unified_attention, vllm::unified_attention_with_output, vllm::unified_mla_attention, vllm::unified_mla_attention_with_output, vllm::mamba_mixer2, vllm::mamba_mixer, vllm::short_conv, vllm::linear_attention, vllm::plamo2_mamba_mixer, vllm::gdn_attention_core, vllm::kda_attention, vllm::sparse_attn_indexer", "cudagraph_mode": "FULL_AND_PIECEWISE", "compile_sizes": "8, 4", "compile_ranges_split_points": "16384", "use_inductor_graph_partition": false, "inductor_passes": "", "enabled_passes": "eliminate_noops", "dynamic_shapes_type": "DynamicShapesType.BACKED", "dynamic_shapes_evaluate_guards": false} +V0127 17:17:45.871000 1175001 /data/users/angelayi/vllm/vllm/compilation/backends.py:782] {"graph_dump": {"name": "vllm_piecewise_split_graph"}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "3f2cf3eb23ce2ac98c893063242a223e"} + class GraphModule(torch.nn.Module): + def forward(self, s72: "Sym(s72)", L_input_ids_: "i32[s72]", L_self_modules_embed_tokens_parameters_weight_: "bf16[64128, 4096]", L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]", s80: "Sym(s72)", L_positions_: "i64[s72]", L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: "bf16[4096]", 
L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: 
"bf16[4096, 7168]", L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", 
L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", 
L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", 
L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", 
L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", L_self_modules_norm_parameters_weight_: "bf16[4096]"): + l_input_ids_ = L_input_ids_ + l_self_modules_embed_tokens_parameters_weight_ = L_self_modules_embed_tokens_parameters_weight_ + l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = L_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ + l_positions_ = L_positions_ + l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = 
L_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = 
L_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = 
L_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ + 
l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = 
L_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ + 
l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = 
L_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ + 
l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = 
L_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = L_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = L_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ + l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = L_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ + l_self_modules_norm_parameters_weight_ = L_self_modules_norm_parameters_weight_ + + # No stacktrace found for following nodes + submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem = submod_0[0] + getitem_1 = submod_0[1] + getitem_2 = submod_0[2] + getitem_3 = submod_0[3] + getitem_4 = submod_0[4]; submod_0 = None + submod_1 = self.submod_1(getitem, s72, getitem_1, getitem_2, getitem_3); getitem = getitem_1 = getitem_2 = submod_1 = None + submod_2 = self.submod_2(getitem_3, s72, l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_, getitem_4, 
l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_3 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = getitem_4 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_5 = submod_2[0] + getitem_6 = submod_2[1] + getitem_7 = submod_2[2] + getitem_8 = submod_2[3] + getitem_9 = submod_2[4]; submod_2 = None + submod_3 = self.submod_3(getitem_5, s72, getitem_6, getitem_7, getitem_8); getitem_5 = getitem_6 = getitem_7 = submod_3 = None + submod_4 = self.submod_4(getitem_8, s72, l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_, getitem_9, l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_8 = l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = getitem_9 = l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_10 = submod_4[0] + getitem_11 = submod_4[1] + getitem_12 = submod_4[2] + getitem_13 = submod_4[3] + getitem_14 = submod_4[4]; submod_4 = None + submod_5 = self.submod_5(getitem_10, s72, getitem_11, getitem_12, getitem_13); getitem_10 = getitem_11 = getitem_12 = submod_5 = None + submod_6 = self.submod_6(getitem_13, s72, l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_, getitem_14, l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_13 = l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = 
l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = getitem_14 = l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_15 = submod_6[0] + getitem_16 = submod_6[1] + getitem_17 = submod_6[2] + getitem_18 = submod_6[3] + getitem_19 = submod_6[4]; submod_6 = None + submod_7 = self.submod_7(getitem_15, s72, getitem_16, getitem_17, getitem_18); getitem_15 = getitem_16 = getitem_17 = submod_7 = None + submod_8 = self.submod_8(getitem_18, s72, l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_, getitem_19, l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_18 = l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = getitem_19 = l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_20 = submod_8[0] + getitem_21 = submod_8[1] + getitem_22 = submod_8[2] + getitem_23 = submod_8[3] + getitem_24 = submod_8[4]; submod_8 = None + submod_9 = self.submod_9(getitem_20, s72, getitem_21, getitem_22, getitem_23); getitem_20 = getitem_21 = getitem_22 = submod_9 = None + submod_10 = self.submod_10(getitem_23, s72, l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_, getitem_24, l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_23 = l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = getitem_24 = l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_25 = submod_10[0] + getitem_26 = submod_10[1] + getitem_27 = submod_10[2] + getitem_28 = submod_10[3] + getitem_29 = 
submod_10[4]; submod_10 = None + submod_11 = self.submod_11(getitem_25, s72, getitem_26, getitem_27, getitem_28); getitem_25 = getitem_26 = getitem_27 = submod_11 = None + submod_12 = self.submod_12(getitem_28, s72, l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_, getitem_29, l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_28 = l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = getitem_29 = l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_30 = submod_12[0] + getitem_31 = submod_12[1] + getitem_32 = submod_12[2] + getitem_33 = submod_12[3] + getitem_34 = submod_12[4]; submod_12 = None + submod_13 = self.submod_13(getitem_30, s72, getitem_31, getitem_32, getitem_33); getitem_30 = getitem_31 = getitem_32 = submod_13 = None + submod_14 = self.submod_14(getitem_33, s72, l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_, getitem_34, l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_33 = l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = getitem_34 = l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_35 = submod_14[0] + getitem_36 = submod_14[1] + getitem_37 = submod_14[2] + getitem_38 = submod_14[3] + getitem_39 = submod_14[4]; submod_14 = None + submod_15 = self.submod_15(getitem_35, s72, getitem_36, getitem_37, getitem_38); getitem_35 = getitem_36 = getitem_37 = submod_15 = None + submod_16 = self.submod_16(getitem_38, s72, l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_, getitem_39, l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_, 
l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_38 = l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = getitem_39 = l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_40 = submod_16[0] + getitem_41 = submod_16[1] + getitem_42 = submod_16[2] + getitem_43 = submod_16[3] + getitem_44 = submod_16[4]; submod_16 = None + submod_17 = self.submod_17(getitem_40, s72, getitem_41, getitem_42, getitem_43); getitem_40 = getitem_41 = getitem_42 = submod_17 = None + submod_18 = self.submod_18(getitem_43, s72, l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_, getitem_44, l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_43 = l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = getitem_44 = l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_45 = submod_18[0] + getitem_46 = submod_18[1] + getitem_47 = submod_18[2] + getitem_48 = submod_18[3] + getitem_49 = submod_18[4]; submod_18 = None + submod_19 = self.submod_19(getitem_45, s72, getitem_46, getitem_47, getitem_48); getitem_45 = getitem_46 = getitem_47 = submod_19 = None + submod_20 = self.submod_20(getitem_48, s72, l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_, getitem_49, l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_48 = l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = getitem_49 = 
l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_50 = submod_20[0] + getitem_51 = submod_20[1] + getitem_52 = submod_20[2] + getitem_53 = submod_20[3] + getitem_54 = submod_20[4]; submod_20 = None + submod_21 = self.submod_21(getitem_50, s72, getitem_51, getitem_52, getitem_53); getitem_50 = getitem_51 = getitem_52 = submod_21 = None + submod_22 = self.submod_22(getitem_53, s72, l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_, getitem_54, l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_53 = l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = getitem_54 = l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_55 = submod_22[0] + getitem_56 = submod_22[1] + getitem_57 = submod_22[2] + getitem_58 = submod_22[3] + getitem_59 = submod_22[4]; submod_22 = None + submod_23 = self.submod_23(getitem_55, s72, getitem_56, getitem_57, getitem_58); getitem_55 = getitem_56 = getitem_57 = submod_23 = None + submod_24 = self.submod_24(getitem_58, s72, l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_, getitem_59, l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_58 = l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = getitem_59 = l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_60 = submod_24[0] + getitem_61 = submod_24[1] + getitem_62 = submod_24[2] + getitem_63 = submod_24[3] + getitem_64 = submod_24[4]; submod_24 = None + submod_25 = 
self.submod_25(getitem_60, s72, getitem_61, getitem_62, getitem_63); getitem_60 = getitem_61 = getitem_62 = submod_25 = None + submod_26 = self.submod_26(getitem_63, s72, l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_, getitem_64, l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_63 = l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = getitem_64 = l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_65 = submod_26[0] + getitem_66 = submod_26[1] + getitem_67 = submod_26[2] + getitem_68 = submod_26[3] + getitem_69 = submod_26[4]; submod_26 = None + submod_27 = self.submod_27(getitem_65, s72, getitem_66, getitem_67, getitem_68); getitem_65 = getitem_66 = getitem_67 = submod_27 = None + submod_28 = self.submod_28(getitem_68, s72, l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_, getitem_69, l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_68 = l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = getitem_69 = l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_70 = submod_28[0] + getitem_71 = submod_28[1] + getitem_72 = submod_28[2] + getitem_73 = submod_28[3] + getitem_74 = submod_28[4]; submod_28 = None + submod_29 = self.submod_29(getitem_70, s72, getitem_71, getitem_72, getitem_73); getitem_70 = getitem_71 = getitem_72 = submod_29 = None + submod_30 = self.submod_30(getitem_73, s72, l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_, getitem_74, l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_, 
l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_73 = l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = getitem_74 = l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_75 = submod_30[0] + getitem_76 = submod_30[1] + getitem_77 = submod_30[2] + getitem_78 = submod_30[3] + getitem_79 = submod_30[4]; submod_30 = None + submod_31 = self.submod_31(getitem_75, s72, getitem_76, getitem_77, getitem_78); getitem_75 = getitem_76 = getitem_77 = submod_31 = None + submod_32 = self.submod_32(getitem_78, s72, l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_, getitem_79, l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_78 = l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = getitem_79 = l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_80 = submod_32[0] + getitem_81 = submod_32[1] + getitem_82 = submod_32[2] + getitem_83 = submod_32[3] + getitem_84 = submod_32[4]; submod_32 = None + submod_33 = self.submod_33(getitem_80, s72, getitem_81, getitem_82, getitem_83); getitem_80 = getitem_81 = getitem_82 = submod_33 = None + submod_34 = self.submod_34(getitem_83, s72, l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_, getitem_84, l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_83 = l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = getitem_84 = 
l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_85 = submod_34[0] + getitem_86 = submod_34[1] + getitem_87 = submod_34[2] + getitem_88 = submod_34[3] + getitem_89 = submod_34[4]; submod_34 = None + submod_35 = self.submod_35(getitem_85, s72, getitem_86, getitem_87, getitem_88); getitem_85 = getitem_86 = getitem_87 = submod_35 = None + submod_36 = self.submod_36(getitem_88, s72, l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_, getitem_89, l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_88 = l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = getitem_89 = l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_90 = submod_36[0] + getitem_91 = submod_36[1] + getitem_92 = submod_36[2] + getitem_93 = submod_36[3] + getitem_94 = submod_36[4]; submod_36 = None + submod_37 = self.submod_37(getitem_90, s72, getitem_91, getitem_92, getitem_93); getitem_90 = getitem_91 = getitem_92 = submod_37 = None + submod_38 = self.submod_38(getitem_93, s72, l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_, getitem_94, l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_93 = l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = getitem_94 = l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_95 = submod_38[0] + getitem_96 = submod_38[1] + getitem_97 = submod_38[2] + getitem_98 = submod_38[3] + getitem_99 = submod_38[4]; submod_38 = None + submod_39 = 
self.submod_39(getitem_95, s72, getitem_96, getitem_97, getitem_98); getitem_95 = getitem_96 = getitem_97 = submod_39 = None + submod_40 = self.submod_40(getitem_98, s72, l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_, getitem_99, l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_98 = l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = getitem_99 = l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_100 = submod_40[0] + getitem_101 = submod_40[1] + getitem_102 = submod_40[2] + getitem_103 = submod_40[3] + getitem_104 = submod_40[4]; submod_40 = None + submod_41 = self.submod_41(getitem_100, s72, getitem_101, getitem_102, getitem_103); getitem_100 = getitem_101 = getitem_102 = submod_41 = None + submod_42 = self.submod_42(getitem_103, s72, l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_, getitem_104, l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_103 = l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = getitem_104 = l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_105 = submod_42[0] + getitem_106 = submod_42[1] + getitem_107 = submod_42[2] + getitem_108 = submod_42[3] + getitem_109 = submod_42[4]; submod_42 = None + submod_43 = self.submod_43(getitem_105, s72, getitem_106, getitem_107, getitem_108); getitem_105 = getitem_106 = getitem_107 = submod_43 = None + submod_44 = self.submod_44(getitem_108, s72, l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_, getitem_109, l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_, 
l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_108 = l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = getitem_109 = l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_110 = submod_44[0] + getitem_111 = submod_44[1] + getitem_112 = submod_44[2] + getitem_113 = submod_44[3] + getitem_114 = submod_44[4]; submod_44 = None + submod_45 = self.submod_45(getitem_110, s72, getitem_111, getitem_112, getitem_113); getitem_110 = getitem_111 = getitem_112 = submod_45 = None + submod_46 = self.submod_46(getitem_113, s72, l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_, getitem_114, l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_113 = l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = getitem_114 = l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_115 = submod_46[0] + getitem_116 = submod_46[1] + getitem_117 = submod_46[2] + getitem_118 = submod_46[3] + getitem_119 = submod_46[4]; submod_46 = None + submod_47 = self.submod_47(getitem_115, s72, getitem_116, getitem_117, getitem_118); getitem_115 = getitem_116 = getitem_117 = submod_47 = None + submod_48 = self.submod_48(getitem_118, s72, l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_, getitem_119, l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_118 = l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = 
l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = getitem_119 = l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_120 = submod_48[0] + getitem_121 = submod_48[1] + getitem_122 = submod_48[2] + getitem_123 = submod_48[3] + getitem_124 = submod_48[4]; submod_48 = None + submod_49 = self.submod_49(getitem_120, s72, getitem_121, getitem_122, getitem_123); getitem_120 = getitem_121 = getitem_122 = submod_49 = None + submod_50 = self.submod_50(getitem_123, s72, l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_, getitem_124, l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_123 = l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = getitem_124 = l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_125 = submod_50[0] + getitem_126 = submod_50[1] + getitem_127 = submod_50[2] + getitem_128 = submod_50[3] + getitem_129 = submod_50[4]; submod_50 = None + submod_51 = self.submod_51(getitem_125, s72, getitem_126, getitem_127, getitem_128); getitem_125 = getitem_126 = getitem_127 = submod_51 = None + submod_52 = self.submod_52(getitem_128, s72, l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_, getitem_129, l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_128 = l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = getitem_129 = l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_130 = submod_52[0] + getitem_131 = 
submod_52[1] + getitem_132 = submod_52[2] + getitem_133 = submod_52[3] + getitem_134 = submod_52[4]; submod_52 = None + submod_53 = self.submod_53(getitem_130, s72, getitem_131, getitem_132, getitem_133); getitem_130 = getitem_131 = getitem_132 = submod_53 = None + submod_54 = self.submod_54(getitem_133, s72, l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_, getitem_134, l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_133 = l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = getitem_134 = l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_135 = submod_54[0] + getitem_136 = submod_54[1] + getitem_137 = submod_54[2] + getitem_138 = submod_54[3] + getitem_139 = submod_54[4]; submod_54 = None + submod_55 = self.submod_55(getitem_135, s72, getitem_136, getitem_137, getitem_138); getitem_135 = getitem_136 = getitem_137 = submod_55 = None + submod_56 = self.submod_56(getitem_138, s72, l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_, getitem_139, l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_138 = l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = getitem_139 = l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_140 = submod_56[0] + getitem_141 = submod_56[1] + getitem_142 = submod_56[2] + getitem_143 = submod_56[3] + getitem_144 = submod_56[4]; submod_56 = None + submod_57 = self.submod_57(getitem_140, s72, getitem_141, getitem_142, getitem_143); getitem_140 = getitem_141 = getitem_142 = submod_57 = None + submod_58 = self.submod_58(getitem_143, s72, l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_, getitem_144, 
l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_143 = l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = getitem_144 = l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_145 = submod_58[0] + getitem_146 = submod_58[1] + getitem_147 = submod_58[2] + getitem_148 = submod_58[3] + getitem_149 = submod_58[4]; submod_58 = None + submod_59 = self.submod_59(getitem_145, s72, getitem_146, getitem_147, getitem_148); getitem_145 = getitem_146 = getitem_147 = submod_59 = None + submod_60 = self.submod_60(getitem_148, s72, l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_, getitem_149, l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_148 = l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = getitem_149 = l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + getitem_150 = submod_60[0] + getitem_151 = submod_60[1] + getitem_152 = submod_60[2] + getitem_153 = submod_60[3] + getitem_154 = submod_60[4]; submod_60 = None + submod_61 = self.submod_61(getitem_150, s72, getitem_151, getitem_152, getitem_153); getitem_150 = getitem_151 = getitem_152 = submod_61 = None + submod_62 = self.submod_62(getitem_153, s72, l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_, getitem_154, l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_153 = 
l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = getitem_154 = l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = l_positions_ = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = None + getitem_155 = submod_62[0] + getitem_156 = submod_62[1] + getitem_157 = submod_62[2] + getitem_158 = submod_62[3] + getitem_159 = submod_62[4]; submod_62 = None + submod_63 = self.submod_63(getitem_155, s72, getitem_156, getitem_157, getitem_158); getitem_155 = getitem_156 = getitem_157 = submod_63 = None + submod_64 = self.submod_64(getitem_158, s72, l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_, getitem_159, l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_norm_parameters_weight_); getitem_158 = s72 = l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = getitem_159 = l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_norm_parameters_weight_ = None + return (submod_64,) + + class submod_0(torch.nn.Module): + def forward(self, l_input_ids_: "i32[s72]", s72: "Sym(s72)", l_self_modules_embed_tokens_parameters_weight_: "bf16[64128, 4096]", l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:167 in get_masked_input_and_mask, code: org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + ge: "b8[s72]" = l_input_ids_ >= 0 + lt: "b8[s72]" = l_input_ids_ < 64128 + and_: "b8[s72]" = ge & lt; ge = lt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:168 in get_masked_input_and_mask, code: added_vocab_mask = (input_ >= added_vocab_start_index) & ( + ge_1: "b8[s72]" = l_input_ids_ >= 128256 + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:169 in get_masked_input_and_mask, code: input_ < added_vocab_end_index + lt_1: "b8[s72]" = l_input_ids_ < 128256 + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:168 in get_masked_input_and_mask, code: added_vocab_mask = (input_ >= added_vocab_start_index) & ( + and__1: "b8[s72]" = ge_1 & lt_1; ge_1 = lt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:176 in get_masked_input_and_mask, code: valid_offset = (org_vocab_start_index * org_vocab_mask) 
+ ( + mul: "i64[s72]" = 0 * and_ + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:177 in get_masked_input_and_mask, code: added_offset * added_vocab_mask + mul_1: "i64[s72]" = 64128 * and__1 + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:176 in get_masked_input_and_mask, code: valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + add: "i64[s72]" = mul + mul_1; mul = mul_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:179 in get_masked_input_and_mask, code: vocab_mask = org_vocab_mask | added_vocab_mask + or_: "b8[s72]" = and_ | and__1; and_ = and__1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:180 in get_masked_input_and_mask, code: input_ = vocab_mask * (input_ - valid_offset) + sub: "i64[s72]" = l_input_ids_ - add; l_input_ids_ = add = None + mul_2: "i64[s72]" = or_ * sub; sub = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:181 in get_masked_input_and_mask, code: return input_, ~vocab_mask + invert: "b8[s72]" = ~or_; or_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:475 in forward_native, code: output_parallel = self.quant_method.embedding(self, masked_input.long()) + long: "i64[s72]" = mul_2.long(); mul_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:72 in embedding, code: return F.embedding(input_, layer.weight) + embedding: "bf16[s72, 4096]" = torch.nn.functional.embedding(long, l_self_modules_embed_tokens_parameters_weight_); long = l_self_modules_embed_tokens_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:478 in forward_native, code: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) + unsqueeze: "b8[s72, 1]" = invert.unsqueeze(-1); invert = None + masked_fill_: "bf16[s72, 4096]" = embedding.masked_fill_(unsqueeze, 0); unsqueeze = masked_fill_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(embedding, group_name = 'tp:0'); embedding = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = to.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul_3: "f32[s72, 4096]" = to * rsqrt; to 
= rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_1 * _get_data_attr; to_1 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear.split([2048, 512, 512], dim = -1); linear = None + getitem: "bf16[s72, 2048]" = split[0] + getitem_1: "bf16[s72, 512]" = split[1] + getitem_2: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_3: "bf16[s72, 64]" = chunk[0] + getitem_4: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view: "bf16[s72, 16, 128]" = getitem.view(s72, -1, 128); getitem = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_5: "bf16[s72, 16, 128]" = view[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_6: "bf16[s72, 16, 0]" = view[(Ellipsis, slice(128, None, None))]; view = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_3.unsqueeze(-2) + to_2: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = 
getitem_4.unsqueeze(-2) + to_3: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_5, 2, dim = -1); getitem_5 = None + getitem_7: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_8: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_7 * to_2 + mul_6: "bf16[s72, 16, 64]" = getitem_8 * to_3 + sub_1: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_8 * to_2; getitem_8 = to_2 = None + mul_8: "bf16[s72, 16, 64]" = getitem_7 * to_3; getitem_7 = to_3 = None + add_2: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub_1, add_2), dim = -1); sub_1 = add_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_6), dim = -1); cat = getitem_6 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_1.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 4, 128]" = getitem_1.view(s72, -1, 128); getitem_1 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_9: "bf16[s72, 4, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_10: "bf16[s72, 4, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_3.unsqueeze(-2); getitem_3 = None + to_4: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_4: "bf16[s72, 1, 64]" = getitem_4.unsqueeze(-2); getitem_4 = None + to_5: "bf16[s72, 1, 64]" = unsqueeze_4.to(torch.bfloat16); unsqueeze_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_9, 2, dim = -1); getitem_9 = None + getitem_11: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_12: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + 
+ # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_11 * to_4 + mul_10: "bf16[s72, 4, 64]" = getitem_12 * to_5 + sub_2: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_12 * to_4; getitem_12 = to_4 = None + mul_12: "bf16[s72, 4, 64]" = getitem_11 * to_5; getitem_11 = to_5 = None + add_3: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_2, add_3), dim = -1); sub_2 = add_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_10), dim = -1); cat_2 = getitem_10 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_2: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_4: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = getitem_2.view(-1, 4, 128); getitem_2 = None + return (view_2, view_4, view_5, view_3, all_reduce) + + class submod_1(torch.nn.Module): + def forward(self, query_2: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_2: "bf16[s72, 4, 128]", value: "bf16[s72, 4, 128]", output_4: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_2, key_2, value, output_4, 'model.layers.0.self_attn.attn'); query_2 = key_2 = value = output_4 = unified_attention_with_output = None + return () + + class submod_2(torch.nn.Module): + def forward(self, output_4: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", output: "bf16[s72, 
4096]", l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_4.view(-1, 2048); output_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + output; to = output = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + 
linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + 
linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + 
mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; 
getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_3(torch.nn.Module): + def forward(self, query_5: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_5: "bf16[s72, 4, 128]", value_1: "bf16[s72, 4, 128]", output_10: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_5, key_5, value_1, output_10, 'model.layers.1.self_attn.attn'); query_5 = key_5 = value_1 = output_10 = unified_attention_with_output = None + return () + + class submod_4(torch.nn.Module): + def forward(self, output_10: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_1: "bf16[s72, 4096]", l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: 
/data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_10.view(-1, 2048); output_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_1; to = residual_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + 
getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 
512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: 
"bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_5(torch.nn.Module): + def forward(self, query_8: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_8: "bf16[s72, 4, 128]", value_2: "bf16[s72, 4, 128]", output_16: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_8, key_8, value_2, output_16, 'model.layers.2.self_attn.attn'); query_8 = key_8 = value_2 = output_16 = unified_attention_with_output = None + return () + + class submod_6(torch.nn.Module): + def forward(self, output_16: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_3: "bf16[s72, 4096]", l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_16.view(-1, 2048); output_16 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_, None); view = 
l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_3; to = residual_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = 
l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: 
"bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = 
getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, 
device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_7(torch.nn.Module): + def forward(self, query_11: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_11: "bf16[s72, 4, 128]", value_3: "bf16[s72, 4, 128]", output_22: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_11, key_11, value_3, output_22, 'model.layers.3.self_attn.attn'); query_11 = key_11 = value_3 = output_22 = unified_attention_with_output = None + return () + + class submod_8(torch.nn.Module): + def forward(self, output_22: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_5: "bf16[s72, 4096]", l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_22.view(-1, 2048); output_22 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + 
_get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_5; to = residual_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: 
"bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; 
chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 
16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_9(torch.nn.Module): + def forward(self, query_14: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_14: "bf16[s72, 4, 128]", value_4: "bf16[s72, 4, 128]", output_28: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_14, key_14, value_4, output_28, 'model.layers.4.self_attn.attn'); query_14 = key_14 = value_4 = output_28 = unified_attention_with_output = None + return () + + class submod_10(torch.nn.Module): + def forward(self, output_28: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_7: "bf16[s72, 4096]", l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_28.view(-1, 2048); output_28 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in 
forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_7; to = residual_7 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x 
+ residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in 
forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); 
getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_11(torch.nn.Module): + def forward(self, query_17: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_17: "bf16[s72, 4, 128]", value_5: "bf16[s72, 4, 128]", output_34: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_17, key_17, value_5, output_34, 'model.layers.5.self_attn.attn'); query_17 = key_17 = value_5 = output_34 = unified_attention_with_output = None + return () + + class submod_12(torch.nn.Module): + def forward(self, output_34: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_9: "bf16[s72, 4096]", l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_34.view(-1, 2048); output_34 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_9; to = residual_9 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 
4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: 
"f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: 
"bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_13(torch.nn.Module): + def forward(self, query_20: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_20: "bf16[s72, 4, 128]", value_6: "bf16[s72, 4, 128]", output_40: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_20, 
key_20, value_6, output_40, 'model.layers.6.self_attn.attn'); query_20 = key_20 = value_6 = output_40 = unified_attention_with_output = None + return () + + class submod_14(torch.nn.Module): + def forward(self, output_40: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_11: "bf16[s72, 4096]", l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_40.view(-1, 2048); output_40 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_11; to = residual_11 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + 
to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_15(torch.nn.Module): + def forward(self, query_23: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_23: "bf16[s72, 4, 128]", value_7: "bf16[s72, 4, 128]", output_46: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_23, key_23, value_7, output_46, 'model.layers.7.self_attn.attn'); query_23 = key_23 = value_7 = output_46 = unified_attention_with_output = None + return () + + class submod_16(torch.nn.Module): + def forward(self, output_46: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_13: "bf16[s72, 
4096]", l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_46.view(-1, 2048); output_46 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_13; to = residual_13 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, 
bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, 
bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 
* sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * 
to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_17(torch.nn.Module): + def forward(self, query_26: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_26: "bf16[s72, 4, 128]", value_8: "bf16[s72, 4, 128]", output_52: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_26, key_26, value_8, output_52, 'model.layers.8.self_attn.attn'); query_26 = key_26 = value_8 = output_52 = unified_attention_with_output = None + return () + + class submod_18(torch.nn.Module): + def forward(self, output_52: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_15: "bf16[s72, 4096]", l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: 
/data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_52.view(-1, 2048); output_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_15; to = residual_15 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + 
getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 
512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: 
"bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_19(torch.nn.Module): + def forward(self, query_29: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_29: "bf16[s72, 4, 128]", value_9: "bf16[s72, 4, 128]", output_58: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_29, key_29, value_9, output_58, 'model.layers.9.self_attn.attn'); query_29 = key_29 = value_9 = output_58 = unified_attention_with_output = None + return () + + class submod_20(torch.nn.Module): + def forward(self, output_58: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_17: "bf16[s72, 4096]", l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_58.view(-1, 2048); output_58 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_, None); view = 
l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_17; to = residual_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = 
l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: 
"bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = 
getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, 
device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_21(torch.nn.Module): + def forward(self, query_32: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_32: "bf16[s72, 4, 128]", value_10: "bf16[s72, 4, 128]", output_64: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_32, key_32, value_10, output_64, 'model.layers.10.self_attn.attn'); query_32 = key_32 = value_10 = output_64 = unified_attention_with_output = None + return () + + class submod_22(torch.nn.Module): + def forward(self, output_64: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_19: "bf16[s72, 4096]", l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_64.view(-1, 2048); output_64 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + 
_get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_19; to = residual_19 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + 
_get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 
64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 
16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_23(torch.nn.Module): + def forward(self, query_35: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_35: "bf16[s72, 4, 128]", value_11: "bf16[s72, 4, 128]", output_70: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_35, key_35, value_11, output_70, 'model.layers.11.self_attn.attn'); query_35 = key_35 = value_11 = output_70 = unified_attention_with_output = None + return () + + class submod_24(torch.nn.Module): + def forward(self, output_70: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_21: "bf16[s72, 4096]", l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_70.view(-1, 2048); output_70 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_21; to = residual_21 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + 
# File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None 
+ + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 
128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_25(torch.nn.Module): + def forward(self, query_38: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_38: "bf16[s72, 4, 128]", value_12: "bf16[s72, 4, 128]", output_76: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_38, key_38, value_12, output_76, 'model.layers.12.self_attn.attn'); query_38 = key_38 = value_12 = output_76 = unified_attention_with_output = None + return () + + class submod_26(torch.nn.Module): + def forward(self, output_76: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_23: "bf16[s72, 4096]", l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_76.view(-1, 2048); output_76 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_23; to = residual_23 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) 
+ pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" 
= add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: 
"bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_27(torch.nn.Module): + def forward(self, query_41: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_41: "bf16[s72, 4, 128]", value_13: "bf16[s72, 4, 128]", output_82: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_41, 
key_41, value_13, output_82, 'model.layers.13.self_attn.attn'); query_41 = key_41 = value_13 = output_82 = unified_attention_with_output = None + return () + + class submod_28(torch.nn.Module): + def forward(self, output_82: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_25: "bf16[s72, 4096]", l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_82.view(-1, 2048); output_82 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_25; to = residual_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) 
+ to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_29(torch.nn.Module): + def forward(self, query_44: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_44: "bf16[s72, 4, 128]", value_14: "bf16[s72, 4, 128]", output_88: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_44, key_44, value_14, output_88, 'model.layers.14.self_attn.attn'); query_44 = key_44 = value_14 = output_88 = unified_attention_with_output = None + return () + + class submod_30(torch.nn.Module): + def forward(self, output_88: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_27: 
"bf16[s72, 4096]", l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_88.view(-1, 2048); output_88 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_27; to = residual_27 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return 
torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: 
return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + 
x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_31(torch.nn.Module): + def forward(self, query_47: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_47: "bf16[s72, 4, 128]", value_15: "bf16[s72, 4, 128]", output_94: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_47, key_47, value_15, output_94, 'model.layers.15.self_attn.attn'); query_47 = key_47 = value_15 = output_94 = unified_attention_with_output = None + return () + + class submod_32(torch.nn.Module): + def forward(self, output_94: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_29: "bf16[s72, 4096]", l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", 
l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_94.view(-1, 2048); output_94 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_29; to = residual_29 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + 
getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, 
self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: 
"bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_33(torch.nn.Module): + def forward(self, query_50: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_50: "bf16[s72, 4, 128]", value_16: "bf16[s72, 4, 128]", output_100: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_50, key_50, value_16, output_100, 'model.layers.16.self_attn.attn'); query_50 = key_50 = value_16 = output_100 = unified_attention_with_output = None + return () + + class submod_34(torch.nn.Module): + def forward(self, output_100: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_31: "bf16[s72, 4096]", l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_100.view(-1, 2048); output_100 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) 
+ linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_31; to = residual_31 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: 
"bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 
in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape 
+ size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_35(torch.nn.Module): + def forward(self, query_53: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_53: "bf16[s72, 4, 128]", value_17: "bf16[s72, 4, 128]", output_106: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_53, key_53, value_17, output_106, 'model.layers.17.self_attn.attn'); query_53 = key_53 = value_17 = output_106 = unified_attention_with_output = None + return () + + class submod_36(torch.nn.Module): + def forward(self, output_106: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_33: "bf16[s72, 4096]", l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_106.view(-1, 2048); output_106 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 
'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_33; to = residual_33 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); 
linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in 
forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, 
self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_37(torch.nn.Module): + def forward(self, query_56: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_56: "bf16[s72, 4, 128]", value_18: "bf16[s72, 4, 128]", output_112: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_56, key_56, value_18, output_112, 'model.layers.18.self_attn.attn'); query_56 = key_56 = value_18 = output_112 = unified_attention_with_output = None + return () + + class submod_38(torch.nn.Module): + def forward(self, output_112: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_35: "bf16[s72, 4096]", l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_112.view(-1, 2048); output_112 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_ = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_35; to = residual_35 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_ = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # 
File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = 
reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_39(torch.nn.Module): + def forward(self, query_59: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_59: "bf16[s72, 4, 128]", value_19: "bf16[s72, 4, 128]", output_118: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_59, key_59, value_19, output_118, 'model.layers.19.self_attn.attn'); query_59 = key_59 = value_19 = output_118 = unified_attention_with_output = None + return () + + class submod_40(torch.nn.Module): + def forward(self, output_118: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_37: "bf16[s72, 4096]", l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_118.view(-1, 2048); output_118 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_37; to = residual_37 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in 
forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: 
residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: 
"bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_41(torch.nn.Module): + def forward(self, query_62: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_62: "bf16[s72, 4, 128]", value_20: "bf16[s72, 4, 128]", 
output_124: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_62, key_62, value_20, output_124, 'model.layers.20.self_attn.attn'); query_62 = key_62 = value_20 = output_124 = unified_attention_with_output = None + return () + + class submod_42(torch.nn.Module): + def forward(self, output_124: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_39: "bf16[s72, 4096]", l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_124.view(-1, 2048); output_124 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_39; to = residual_39 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * 
torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + 
variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + 
# File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = 
torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_43(torch.nn.Module): + def forward(self, query_65: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_65: "bf16[s72, 4, 128]", value_21: "bf16[s72, 4, 128]", output_130: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_65, key_65, value_21, output_130, 'model.layers.21.self_attn.attn'); query_65 = key_65 = value_21 = output_130 = unified_attention_with_output = None + return () + + class submod_44(torch.nn.Module): + def forward(self, output_130: "bf16[s72, 16, 128]", s72: "Sym(s72)", 
l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_41: "bf16[s72, 4096]", l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_130.view(-1, 2048); output_130 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_41; to = residual_41 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + 
mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + 
mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + 
getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_45(torch.nn.Module): + def forward(self, query_68: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_68: "bf16[s72, 4, 128]", value_22: "bf16[s72, 4, 128]", output_136: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_68, key_68, value_22, output_136, 'model.layers.22.self_attn.attn'); query_68 = key_68 = value_22 = output_136 = unified_attention_with_output = None + return () + + class submod_46(torch.nn.Module): + def forward(self, output_136: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_43: "bf16[s72, 4096]", l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_: 
"bf16[4096]", l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_136.view(-1, 2048); output_136 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_43; to = residual_43 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # 
File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 
in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + 
cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_47(torch.nn.Module): + def forward(self, query_71: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_71: "bf16[s72, 4, 128]", value_23: "bf16[s72, 4, 128]", output_142: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_71, key_71, value_23, output_142, 'model.layers.23.self_attn.attn'); query_71 = key_71 = value_23 = output_142 = unified_attention_with_output = None + return () + + class submod_48(torch.nn.Module): + def forward(self, output_142: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_45: "bf16[s72, 4096]", l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_142.view(-1, 2048); output_142 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_45; to = residual_45 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in 
forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = 
torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # 
File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_49(torch.nn.Module): + def forward(self, query_74: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_74: "bf16[s72, 4, 128]", value_24: "bf16[s72, 4, 128]", output_148: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_74, key_74, value_24, output_148, 'model.layers.24.self_attn.attn'); query_74 = key_74 = value_24 = output_148 = unified_attention_with_output = None + return () + + class submod_50(torch.nn.Module): + def forward(self, output_148: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_47: "bf16[s72, 4096]", l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_148.view(-1, 2048); output_148 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return 
torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_47; to = residual_47 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return 
torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); 
l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = 
device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_51(torch.nn.Module): + def forward(self, query_77: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_77: "bf16[s72, 4, 128]", value_25: "bf16[s72, 4, 128]", output_154: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_77, key_77, value_25, output_154, 'model.layers.25.self_attn.attn'); query_77 = key_77 = value_25 = output_154 = unified_attention_with_output = None + return () + + class submod_52(torch.nn.Module): + def forward(self, output_154: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_49: "bf16[s72, 4096]", l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_154.view(-1, 2048); output_154 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = 
torch._C._autograd._get_data_attr(l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_49; to = residual_49 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = 
torch._C._autograd._get_data_attr(l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + 
+ # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 
16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_53(torch.nn.Module): + def forward(self, query_80: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_80: "bf16[s72, 4, 128]", value_26: "bf16[s72, 4, 128]", output_160: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_80, key_80, value_26, output_160, 'model.layers.26.self_attn.attn'); query_80 = key_80 = value_26 = output_160 = unified_attention_with_output = None + return () + + class submod_54(torch.nn.Module): + def forward(self, output_160: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_51: "bf16[s72, 4096]", l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_160.view(-1, 2048); output_160 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_51; to = residual_51 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + 
# File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None 
+ + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 
128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_55(torch.nn.Module): + def forward(self, query_83: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_83: "bf16[s72, 4, 128]", value_27: "bf16[s72, 4, 128]", output_166: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_83, key_83, value_27, output_166, 'model.layers.27.self_attn.attn'); query_83 = key_83 = value_27 = output_166 = unified_attention_with_output = None + return () + + class submod_56(torch.nn.Module): + def forward(self, output_166: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_53: "bf16[s72, 4096]", l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_166.view(-1, 2048); output_166 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_53; to = residual_53 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, 
keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: 
"f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: 
"bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_57(torch.nn.Module): + def forward(self, query_86: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_86: "bf16[s72, 4, 128]", value_28: "bf16[s72, 4, 128]", output_172: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_86, 
key_86, value_28, output_172, 'model.layers.28.self_attn.attn'); query_86 = key_86 = value_28 = output_172 = unified_attention_with_output = None + return () + + class submod_58(torch.nn.Module): + def forward(self, output_172: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_55: "bf16[s72, 4096]", l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_172.view(-1, 2048); output_172 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_55; to = residual_55 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) 
+ to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_59(torch.nn.Module): + def forward(self, query_89: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_89: "bf16[s72, 4, 128]", value_29: "bf16[s72, 4, 128]", output_178: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_89, key_89, value_29, output_178, 'model.layers.29.self_attn.attn'); query_89 = key_89 = value_29 = output_178 = unified_attention_with_output = None + return () + + class submod_60(torch.nn.Module): + def forward(self, output_178: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_57: 
"bf16[s72, 4096]", l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_178.view(-1, 2048); output_178 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_57; to = residual_57 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return 
torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: 
return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + 
x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_61(torch.nn.Module): + def forward(self, query_92: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_92: "bf16[s72, 4, 128]", value_30: "bf16[s72, 4, 128]", output_184: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_92, key_92, value_30, output_184, 'model.layers.30.self_attn.attn'); query_92 = key_92 = value_30 = output_184 = unified_attention_with_output = None + return () + + class submod_62(torch.nn.Module): + def forward(self, output_184: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_59: "bf16[s72, 4096]", l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", 
l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_184.view(-1, 2048); output_184 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_59; to = residual_59 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + 
getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, 
self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: 
"bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + + class submod_63(torch.nn.Module): + def forward(self, query_95: "bf16[s72, 16, 128]", s72: "Sym(s72)", key_95: "bf16[s72, 4, 128]", value_31: "bf16[s72, 4, 128]", output_190: "bf16[s72, 16, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:347 in forward, code: torch.ops.vllm.unified_attention_with_output( + unified_attention_with_output = torch.ops.vllm.unified_attention_with_output(query_95, key_95, value_31, output_190, 'model.layers.31.self_attn.attn'); query_95 = key_95 = value_31 = output_190 = unified_attention_with_output = None + return () + + class submod_64(torch.nn.Module): + def forward(self, output_190: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", residual_61: "bf16[s72, 4096]", l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_norm_parameters_weight_: "bf16[4096]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_190.view(-1, 2048); output_190 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: 
/data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + residual_61; to = residual_61 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: 
/data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_norm_parameters_weight_); l_self_modules_norm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16); to_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) + mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + return mul_4 + +V0127 17:17:47.272000 1175001 site-packages/torch/_dynamo/guards.py:3778] {"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "0bb29cba30b0818d498a696ba892cc71"} + + TREE_GUARD_MANAGER: + +- RootGuardManager + + Guard latency = 6.20 us +V0127 17:17:47.294000 1175001 site-packages/torch/_dynamo/utils.py:1724] {"compilation_metrics": {"compile_id": "0/0", "frame_key": "1", "co_name": "forward", "co_filename": "/data/users/angelayi/vllm/vllm/model_executor/models/llama.py", "co_firstlineno": 412, "cache_size": 0, "accumulated_cache_size": 0, "guard_count": 6810, "shape_env_guard_count": 3, "graph_op_count": 2875, "graph_node_count": 3075, "graph_input_count": 199, "start_time": 1769563061.181114, "entire_frame_compile_time_s": 6.111051, "backend_compile_time_s": 1.818307, "inductor_compile_time_s": null, "code_gen_time_s": null, "fail_type": null, "fail_reason": null, "fail_user_frame_filename": null, "fail_user_frame_lineno": null, "non_compliant_ops": ["vllm::all_reduce", "vllm::unified_attention_with_output"], "compliant_custom_ops": [], "restart_reasons": [], "dynamo_time_before_restart_s": 0.0, "stack_trace": ["Line: 1, Name: , Filename: ", "Line: 122, Name: spawn_main, Filename: /home/angelayi/.conda/envs/vllm-2.10/lib/python3.12/multiprocessing/spawn.py", "Line: 135, 
Name: _main, Filename: /home/angelayi/.conda/envs/vllm-2.10/lib/python3.12/multiprocessing/spawn.py", "Line: 314, Name: _bootstrap, Filename: /home/angelayi/.conda/envs/vllm-2.10/lib/python3.12/multiprocessing/process.py", "Line: 108, Name: run, Filename: /home/angelayi/.conda/envs/vllm-2.10/lib/python3.12/multiprocessing/process.py", "Line: 742, Name: worker_main, Filename: /data/users/angelayi/vllm/vllm/v1/executor/multiproc_executor.py", "Line: 819, Name: worker_busy_loop, Filename: /data/users/angelayi/vllm/vllm/v1/executor/multiproc_executor.py", "Line: 124, Name: decorate_context, Filename: /home/angelayi/.conda/envs/vllm-2.10/lib/python3.12/site-packages/torch/utils/_contextlib.py", "Line: 340, Name: determine_available_memory, Filename: /data/users/angelayi/vllm/vllm/v1/worker/gpu_worker.py", "Line: 4516, Name: profile_run, Filename: /data/users/angelayi/vllm/vllm/v1/worker/gpu_model_runner.py", "Line: 124, Name: decorate_context, Filename: /home/angelayi/.conda/envs/vllm-2.10/lib/python3.12/site-packages/torch/utils/_contextlib.py", "Line: 4217, Name: _dummy_run, Filename: /data/users/angelayi/vllm/vllm/v1/worker/gpu_model_runner.py", "Line: 220, Name: __call__, Filename: /data/users/angelayi/vllm/vllm/compilation/cuda_graph.py", "Line: 1776, Name: _wrapped_call_impl, Filename: /home/angelayi/.conda/envs/vllm-2.10/lib/python3.12/site-packages/torch/nn/modules/module.py", "Line: 1787, Name: _call_impl, Filename: /home/angelayi/.conda/envs/vllm-2.10/lib/python3.12/site-packages/torch/nn/modules/module.py", "Line: 623, Name: forward, Filename: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py", "Line: 526, Name: __call__, Filename: /data/users/angelayi/vllm/vllm/compilation/decorators.py", "Line: 218, Name: __call__, Filename: /data/users/angelayi/vllm/vllm/compilation/wrapper.py", "Line: 412, Name: forward, Filename: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py"], "exception_stack_trace": null, "graph_node_shapes": "{'l_input_ids_': ['s72'], 'l_self_modules_embed_tokens_parameters_weight_': [64128, 4096], 'l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_': [131072, 128], 'l_positions_': ['s72'], 'l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_1_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_1_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_1_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_1_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_2_modules_input_layernorm_parameters_weight_': [4096], 
'l_self_modules_layers_modules_2_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_2_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_2_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_2_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_2_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_3_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_3_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_3_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_3_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_3_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_3_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_4_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_4_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_4_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_4_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_4_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_4_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_5_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_5_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_5_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_5_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_5_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_5_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_6_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_6_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_6_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_6_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_6_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_6_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_7_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_7_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_7_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_7_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_7_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_7_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_8_modules_input_layernorm_parameters_weight_': [4096], 
'l_self_modules_layers_modules_8_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_8_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_8_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_8_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_8_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_9_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_9_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_9_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_9_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_9_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_9_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_10_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_10_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_10_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_10_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_10_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_10_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_11_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_11_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_11_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_11_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_11_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_11_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_12_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_12_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_12_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_12_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_12_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_12_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_13_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_13_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_13_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_13_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_13_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_13_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_14_modules_input_layernorm_parameters_weight_': [4096], 
'l_self_modules_layers_modules_14_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_14_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_14_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_14_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_14_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_15_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_15_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_15_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_15_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_15_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_15_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_16_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_16_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_16_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_16_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_16_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_16_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_17_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_17_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_17_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_17_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_17_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_17_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_18_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_18_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_18_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_18_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_18_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_18_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_19_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_19_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_19_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_19_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_19_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_19_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_20_modules_input_layernorm_parameters_weight_': [4096], 
'l_self_modules_layers_modules_20_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_20_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_20_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_20_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_20_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_21_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_21_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_21_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_21_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_21_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_21_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_22_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_22_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_22_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_22_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_22_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_22_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_23_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_23_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_23_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_23_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_23_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_23_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_24_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_24_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_24_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_24_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_24_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_24_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_25_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_25_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_25_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_25_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_25_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_25_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_26_modules_input_layernorm_parameters_weight_': [4096], 
'l_self_modules_layers_modules_26_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_26_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_26_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_26_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_26_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_27_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_27_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_27_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_27_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_27_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_27_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_28_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_28_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_28_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_28_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_28_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_28_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_29_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_29_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_29_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_29_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_29_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_29_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_30_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_30_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_30_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_30_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_30_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_30_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_layers_modules_31_modules_input_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_31_modules_self_attn_modules_qkv_proj_parameters_weight_': [3072, 4096], 'l_self_modules_layers_modules_31_modules_self_attn_modules_o_proj_parameters_weight_': [4096, 2048], 'l_self_modules_layers_modules_31_modules_post_attention_layernorm_parameters_weight_': [4096], 'l_self_modules_layers_modules_31_modules_mlp_modules_gate_up_proj_parameters_weight_': [14336, 4096], 'l_self_modules_layers_modules_31_modules_mlp_modules_down_proj_parameters_weight_': [4096, 7168], 'l_self_modules_norm_parameters_weight_': [4096], 'ge': ['s72'], 'lt': ['s72'], 
'org_vocab_mask': ['s72'], 'ge_1': ['s72'], 'lt_1': ['s72'], 'added_vocab_mask': ['s72'], 'mul': ['s72'], 'mul_1': ['s72'], 'valid_offset': ['s72'], 'vocab_mask': ['s72'], 'sub': ['s72'], 'input_': ['s72'], 'input_mask': ['s72'], 'long': ['s72'], 'output_parallel': ['s72', 4096], 'unsqueeze': ['s72', 1], 'masked_fill_': ['s72', 4096], 'output': ['s72', 4096], '_get_data_attr': [4096], 'x': ['s72', 4096], 'pow_1': ['s72', 4096], 'variance': ['s72', 1], 'add_1': ['s72', 1], 'rsqrt': ['s72', 1], 'x_1': ['s72', 4096], 'x_2': ['s72', 4096], 'x_3': ['s72', 4096], 'output_parallel_1': ['s72', 3072], 'q': ['s72', 2048], 'k': ['s72', 512], 'v': ['s72', 512], 'positions': ['s72'], 'cos_sin': ['s72', 128], 'cos': ['s72', 64], 'sin': ['s72', 64], 'query': ['s72', 16, 128], 'query_rot': ['s72', 16, 128], 'query_pass': ['s72', 16, 0], 'unsqueeze_1': ['s72', 1, 64], 'cos_1': ['s72', 1, 64], 'unsqueeze_2': ['s72', 1, 64], 'sin_1': ['s72', 1, 64], 'x1': ['s72', 16, 64], 'x2': ['s72', 16, 64], 'mul_5': ['s72', 16, 64], 'mul_6': ['s72', 16, 64], 'o1': ['s72', 16, 64], 'mul_7': ['s72', 16, 64], 'mul_8': ['s72', 16, 64], 'o2': ['s72', 16, 64], 'output_1': ['s72', 16, 128], 'cat_1': ['s72', 16, 128], 'query_1': ['s72', 2048], 'key': ['s72', 4, 128], 'key_rot': ['s72', 4, 128], 'key_pass': ['s72', 4, 0], 'unsqueeze_3': ['s72', 1, 64], 'cos_2': ['s72', 1, 64], 'unsqueeze_4': ['s72', 1, 64], 'sin_2': ['s72', 1, 64], 'x1_1': ['s72', 4, 64], 'x2_1': ['s72', 4, 64], 'mul_9': ['s72', 4, 64], 'mul_10': ['s72', 4, 64], 'o1_1': ['s72', 4, 64], 'mul_11': ['s72', 4, 64], 'mul_12': ['s72', 4, 64], 'o2_1': ['s72', 4, 64], 'output_2': ['s72', 4, 128], 'cat_3': ['s72', 4, 128], 'key_1': ['s72', 512], 'output_3': ['s72', 2048], 'query_2': ['s72', 16, 128], 'output_4': ['s72', 16, 128], 'key_2': ['s72', 4, 128], 'value': ['s72', 4, 128], 'attn_output': ['s72', 2048], 'output_parallel_2': ['s72', 4096], 'output_5': ['s72', 4096], '_get_data_attr_1': [4096], 'x_4': ['s72', 4096], 'x_5': ['s72', 4096], 'residual': ['s72', 4096], 'pow_2': ['s72', 4096], 'variance_1': ['s72', 1], 'add_5': ['s72', 1], 'rsqrt_1': ['s72', 1], 'x_6': ['s72', 4096], 'x_7': ['s72', 4096], 'x_8': ['s72', 4096], 'output_parallel_3': ['s72', 14336], 'getitem_26': ['s72', 7168], 'silu': ['s72', 7168], 'getitem_27': ['s72', 7168], 'x_9': ['s72', 7168], 'output_parallel_4': ['s72', 4096], 'output_6': ['s72', 4096], '_get_data_attr_2': [4096], 'x_10': ['s72', 4096], 'x_11': ['s72', 4096], 'residual_1': ['s72', 4096], 'pow_3': ['s72', 4096], 'variance_2': ['s72', 1], 'add_7': ['s72', 1], 'rsqrt_2': ['s72', 1], 'x_12': ['s72', 4096], 'x_13': ['s72', 4096], 'x_14': ['s72', 4096], 'output_parallel_5': ['s72', 3072], 'q_1': ['s72', 2048], 'k_1': ['s72', 512], 'v_1': ['s72', 512], 'positions_1': ['s72'], 'cos_sin_1': ['s72', 128], 'cos_3': ['s72', 64], 'sin_3': ['s72', 64], 'query_3': ['s72', 16, 128], 'query_rot_1': ['s72', 16, 128], 'query_pass_1': ['s72', 16, 0], 'unsqueeze_5': ['s72', 1, 64], 'cos_4': ['s72', 1, 64], 'unsqueeze_6': ['s72', 1, 64], 'sin_4': ['s72', 1, 64], 'x1_2': ['s72', 16, 64], 'x2_2': ['s72', 16, 64], 'mul_18': ['s72', 16, 64], 'mul_19': ['s72', 16, 64], 'o1_2': ['s72', 16, 64], 'mul_20': ['s72', 16, 64], 'mul_21': ['s72', 16, 64], 'o2_2': ['s72', 16, 64], 'output_7': ['s72', 16, 128], 'cat_5': ['s72', 16, 128], 'query_4': ['s72', 2048], 'key_3': ['s72', 4, 128], 'key_rot_1': ['s72', 4, 128], 'key_pass_1': ['s72', 4, 0], 'unsqueeze_7': ['s72', 1, 64], 'cos_5': ['s72', 1, 64], 'unsqueeze_8': ['s72', 1, 64], 'sin_5': ['s72', 1, 64], 'x1_3': 
['s72', 4, 64], 'x2_3': ['s72', 4, 64], 'mul_22': ['s72', 4, 64], 'mul_23': ['s72', 4, 64], 'o1_3': ['s72', 4, 64], 'mul_24': ['s72', 4, 64], 'mul_25': ['s72', 4, 64], 'o2_3': ['s72', 4, 64], 'output_8': ['s72', 4, 128], 'cat_7': ['s72', 4, 128], 'key_4': ['s72', 512], 'output_9': ['s72', 2048], 'query_5': ['s72', 16, 128], 'output_10': ['s72', 16, 128], 'key_5': ['s72', 4, 128], 'value_1': ['s72', 4, 128], 'attn_output_1': ['s72', 2048], 'output_parallel_6': ['s72', 4096], 'output_11': ['s72', 4096], '_get_data_attr_3': [4096], 'x_15': ['s72', 4096], 'x_16': ['s72', 4096], 'residual_2': ['s72', 4096], 'pow_4': ['s72', 4096], 'variance_3': ['s72', 1], 'add_11': ['s72', 1], 'rsqrt_3': ['s72', 1], 'x_17': ['s72', 4096], 'x_18': ['s72', 4096], 'x_19': ['s72', 4096], 'output_parallel_7': ['s72', 14336], 'getitem_54': ['s72', 7168], 'silu_1': ['s72', 7168], 'getitem_55': ['s72', 7168], 'x_20': ['s72', 7168], 'output_parallel_8': ['s72', 4096], 'output_12': ['s72', 4096], '_get_data_attr_4': [4096], 'x_21': ['s72', 4096], 'x_22': ['s72', 4096], 'residual_3': ['s72', 4096], 'pow_5': ['s72', 4096], 'variance_4': ['s72', 1], 'add_13': ['s72', 1], 'rsqrt_4': ['s72', 1], 'x_23': ['s72', 4096], 'x_24': ['s72', 4096], 'x_25': ['s72', 4096], 'output_parallel_9': ['s72', 3072], 'q_2': ['s72', 2048], 'k_2': ['s72', 512], 'v_2': ['s72', 512], 'positions_2': ['s72'], 'cos_sin_2': ['s72', 128], 'cos_6': ['s72', 64], 'sin_6': ['s72', 64], 'query_6': ['s72', 16, 128], 'query_rot_2': ['s72', 16, 128], 'query_pass_2': ['s72', 16, 0], 'unsqueeze_9': ['s72', 1, 64], 'cos_7': ['s72', 1, 64], 'unsqueeze_10': ['s72', 1, 64], 'sin_7': ['s72', 1, 64], 'x1_4': ['s72', 16, 64], 'x2_4': ['s72', 16, 64], 'mul_31': ['s72', 16, 64], 'mul_32': ['s72', 16, 64], 'o1_4': ['s72', 16, 64], 'mul_33': ['s72', 16, 64], 'mul_34': ['s72', 16, 64], 'o2_4': ['s72', 16, 64], 'output_13': ['s72', 16, 128], 'cat_9': ['s72', 16, 128], 'query_7': ['s72', 2048], 'key_6': ['s72', 4, 128], 'key_rot_2': ['s72', 4, 128], 'key_pass_2': ['s72', 4, 0], 'unsqueeze_11': ['s72', 1, 64], 'cos_8': ['s72', 1, 64], 'unsqueeze_12': ['s72', 1, 64], 'sin_8': ['s72', 1, 64], 'x1_5': ['s72', 4, 64], 'x2_5': ['s72', 4, 64], 'mul_35': ['s72', 4, 64], 'mul_36': ['s72', 4, 64], 'o1_5': ['s72', 4, 64], 'mul_37': ['s72', 4, 64], 'mul_38': ['s72', 4, 64], 'o2_5': ['s72', 4, 64], 'output_14': ['s72', 4, 128], 'cat_11': ['s72', 4, 128], 'key_7': ['s72', 512], 'output_15': ['s72', 2048], 'query_8': ['s72', 16, 128], 'output_16': ['s72', 16, 128], 'key_8': ['s72', 4, 128], 'value_2': ['s72', 4, 128], 'attn_output_2': ['s72', 2048], 'output_parallel_10': ['s72', 4096], 'output_17': ['s72', 4096], '_get_data_attr_5': [4096], 'x_26': ['s72', 4096], 'x_27': ['s72', 4096], 'residual_4': ['s72', 4096], 'pow_6': ['s72', 4096], 'variance_5': ['s72', 1], 'add_17': ['s72', 1], 'rsqrt_5': ['s72', 1], 'x_28': ['s72', 4096], 'x_29': ['s72', 4096], 'x_30': ['s72', 4096], 'output_parallel_11': ['s72', 14336], 'getitem_82': ['s72', 7168], 'silu_2': ['s72', 7168], 'getitem_83': ['s72', 7168], 'x_31': ['s72', 7168], 'output_parallel_12': ['s72', 4096], 'output_18': ['s72', 4096], '_get_data_attr_6': [4096], 'x_32': ['s72', 4096], 'x_33': ['s72', 4096], 'residual_5': ['s72', 4096], 'pow_7': ['s72', 4096], 'variance_6': ['s72', 1], 'add_19': ['s72', 1], 'rsqrt_6': ['s72', 1], 'x_34': ['s72', 4096], 'x_35': ['s72', 4096], 'x_36': ['s72', 4096], 'output_parallel_13': ['s72', 3072], 'q_3': ['s72', 2048], 'k_3': ['s72', 512], 'v_3': ['s72', 512], 'positions_3': ['s72'], 'cos_sin_3': ['s72', 128], 
'cos_9': ['s72', 64], 'sin_9': ['s72', 64], 'query_9': ['s72', 16, 128], 'query_rot_3': ['s72', 16, 128], 'query_pass_3': ['s72', 16, 0], 'unsqueeze_13': ['s72', 1, 64], 'cos_10': ['s72', 1, 64], 'unsqueeze_14': ['s72', 1, 64], 'sin_10': ['s72', 1, 64], 'x1_6': ['s72', 16, 64], 'x2_6': ['s72', 16, 64], 'mul_44': ['s72', 16, 64], 'mul_45': ['s72', 16, 64], 'o1_6': ['s72', 16, 64], 'mul_46': ['s72', 16, 64], 'mul_47': ['s72', 16, 64], 'o2_6': ['s72', 16, 64], 'output_19': ['s72', 16, 128], 'cat_13': ['s72', 16, 128], 'query_10': ['s72', 2048], 'key_9': ['s72', 4, 128], 'key_rot_3': ['s72', 4, 128], 'key_pass_3': ['s72', 4, 0], 'unsqueeze_15': ['s72', 1, 64], 'cos_11': ['s72', 1, 64], 'unsqueeze_16': ['s72', 1, 64], 'sin_11': ['s72', 1, 64], 'x1_7': ['s72', 4, 64], 'x2_7': ['s72', 4, 64], 'mul_48': ['s72', 4, 64], 'mul_49': ['s72', 4, 64], 'o1_7': ['s72', 4, 64], 'mul_50': ['s72', 4, 64], 'mul_51': ['s72', 4, 64], 'o2_7': ['s72', 4, 64], 'output_20': ['s72', 4, 128], 'cat_15': ['s72', 4, 128], 'key_10': ['s72', 512], 'output_21': ['s72', 2048], 'query_11': ['s72', 16, 128], 'output_22': ['s72', 16, 128], 'key_11': ['s72', 4, 128], 'value_3': ['s72', 4, 128], 'attn_output_3': ['s72', 2048], 'output_parallel_14': ['s72', 4096], 'output_23': ['s72', 4096], '_get_data_attr_7': [4096], 'x_37': ['s72', 4096], 'x_38': ['s72', 4096], 'residual_6': ['s72', 4096], 'pow_8': ['s72', 4096], 'variance_7': ['s72', 1], 'add_23': ['s72', 1], 'rsqrt_7': ['s72', 1], 'x_39': ['s72', 4096], 'x_40': ['s72', 4096], 'x_41': ['s72', 4096], 'output_parallel_15': ['s72', 14336], 'getitem_110': ['s72', 7168], 'silu_3': ['s72', 7168], 'getitem_111': ['s72', 7168], 'x_42': ['s72', 7168], 'output_parallel_16': ['s72', 4096], 'output_24': ['s72', 4096], '_get_data_attr_8': [4096], 'x_43': ['s72', 4096], 'x_44': ['s72', 4096], 'residual_7': ['s72', 4096], 'pow_9': ['s72', 4096], 'variance_8': ['s72', 1], 'add_25': ['s72', 1], 'rsqrt_8': ['s72', 1], 'x_45': ['s72', 4096], 'x_46': ['s72', 4096], 'x_47': ['s72', 4096], 'output_parallel_17': ['s72', 3072], 'q_4': ['s72', 2048], 'k_4': ['s72', 512], 'v_4': ['s72', 512], 'positions_4': ['s72'], 'cos_sin_4': ['s72', 128], 'cos_12': ['s72', 64], 'sin_12': ['s72', 64], 'query_12': ['s72', 16, 128], 'query_rot_4': ['s72', 16, 128], 'query_pass_4': ['s72', 16, 0], 'unsqueeze_17': ['s72', 1, 64], 'cos_13': ['s72', 1, 64], 'unsqueeze_18': ['s72', 1, 64], 'sin_13': ['s72', 1, 64], 'x1_8': ['s72', 16, 64], 'x2_8': ['s72', 16, 64], 'mul_57': ['s72', 16, 64], 'mul_58': ['s72', 16, 64], 'o1_8': ['s72', 16, 64], 'mul_59': ['s72', 16, 64], 'mul_60': ['s72', 16, 64], 'o2_8': ['s72', 16, 64], 'output_25': ['s72', 16, 128], 'cat_17': ['s72', 16, 128], 'query_13': ['s72', 2048], 'key_12': ['s72', 4, 128], 'key_rot_4': ['s72', 4, 128], 'key_pass_4': ['s72', 4, 0], 'unsqueeze_19': ['s72', 1, 64], 'cos_14': ['s72', 1, 64], 'unsqueeze_20': ['s72', 1, 64], 'sin_14': ['s72', 1, 64], 'x1_9': ['s72', 4, 64], 'x2_9': ['s72', 4, 64], 'mul_61': ['s72', 4, 64], 'mul_62': ['s72', 4, 64], 'o1_9': ['s72', 4, 64], 'mul_63': ['s72', 4, 64], 'mul_64': ['s72', 4, 64], 'o2_9': ['s72', 4, 64], 'output_26': ['s72', 4, 128], 'cat_19': ['s72', 4, 128], 'key_13': ['s72', 512], 'output_27': ['s72', 2048], 'query_14': ['s72', 16, 128], 'output_28': ['s72', 16, 128], 'key_14': ['s72', 4, 128], 'value_4': ['s72', 4, 128], 'attn_output_4': ['s72', 2048], 'output_parallel_18': ['s72', 4096], 'output_29': ['s72', 4096], '_get_data_attr_9': [4096], 'x_48': ['s72', 4096], 'x_49': ['s72', 4096], 'residual_8': ['s72', 4096], 
'pow_10': ['s72', 4096], 'variance_9': ['s72', 1], 'add_29': ['s72', 1], 'rsqrt_9': ['s72', 1], 'x_50': ['s72', 4096], 'x_51': ['s72', 4096], 'x_52': ['s72', 4096], 'output_parallel_19': ['s72', 14336], 'getitem_138': ['s72', 7168], 'silu_4': ['s72', 7168], 'getitem_139': ['s72', 7168], 'x_53': ['s72', 7168], 'output_parallel_20': ['s72', 4096], 'output_30': ['s72', 4096], '_get_data_attr_10': [4096], 'x_54': ['s72', 4096], 'x_55': ['s72', 4096], 'residual_9': ['s72', 4096], 'pow_11': ['s72', 4096], 'variance_10': ['s72', 1], 'add_31': ['s72', 1], 'rsqrt_10': ['s72', 1], 'x_56': ['s72', 4096], 'x_57': ['s72', 4096], 'x_58': ['s72', 4096], 'output_parallel_21': ['s72', 3072], 'q_5': ['s72', 2048], 'k_5': ['s72', 512], 'v_5': ['s72', 512], 'positions_5': ['s72'], 'cos_sin_5': ['s72', 128], 'cos_15': ['s72', 64], 'sin_15': ['s72', 64], 'query_15': ['s72', 16, 128], 'query_rot_5': ['s72', 16, 128], 'query_pass_5': ['s72', 16, 0], 'unsqueeze_21': ['s72', 1, 64], 'cos_16': ['s72', 1, 64], 'unsqueeze_22': ['s72', 1, 64], 'sin_16': ['s72', 1, 64], 'x1_10': ['s72', 16, 64], 'x2_10': ['s72', 16, 64], 'mul_70': ['s72', 16, 64], 'mul_71': ['s72', 16, 64], 'o1_10': ['s72', 16, 64], 'mul_72': ['s72', 16, 64], 'mul_73': ['s72', 16, 64], 'o2_10': ['s72', 16, 64], 'output_31': ['s72', 16, 128], 'cat_21': ['s72', 16, 128], 'query_16': ['s72', 2048], 'key_15': ['s72', 4, 128], 'key_rot_5': ['s72', 4, 128], 'key_pass_5': ['s72', 4, 0], 'unsqueeze_23': ['s72', 1, 64], 'cos_17': ['s72', 1, 64], 'unsqueeze_24': ['s72', 1, 64], 'sin_17': ['s72', 1, 64], 'x1_11': ['s72', 4, 64], 'x2_11': ['s72', 4, 64], 'mul_74': ['s72', 4, 64], 'mul_75': ['s72', 4, 64], 'o1_11': ['s72', 4, 64], 'mul_76': ['s72', 4, 64], 'mul_77': ['s72', 4, 64], 'o2_11': ['s72', 4, 64], 'output_32': ['s72', 4, 128], 'cat_23': ['s72', 4, 128], 'key_16': ['s72', 512], 'output_33': ['s72', 2048], 'query_17': ['s72', 16, 128], 'output_34': ['s72', 16, 128], 'key_17': ['s72', 4, 128], 'value_5': ['s72', 4, 128], 'attn_output_5': ['s72', 2048], 'output_parallel_22': ['s72', 4096], 'output_35': ['s72', 4096], '_get_data_attr_11': [4096], 'x_59': ['s72', 4096], 'x_60': ['s72', 4096], 'residual_10': ['s72', 4096], 'pow_12': ['s72', 4096], 'variance_11': ['s72', 1], 'add_35': ['s72', 1], 'rsqrt_11': ['s72', 1], 'x_61': ['s72', 4096], 'x_62': ['s72', 4096], 'x_63': ['s72', 4096], 'output_parallel_23': ['s72', 14336], 'getitem_166': ['s72', 7168], 'silu_5': ['s72', 7168], 'getitem_167': ['s72', 7168], 'x_64': ['s72', 7168], 'output_parallel_24': ['s72', 4096], 'output_36': ['s72', 4096], '_get_data_attr_12': [4096], 'x_65': ['s72', 4096], 'x_66': ['s72', 4096], 'residual_11': ['s72', 4096], 'pow_13': ['s72', 4096], 'variance_12': ['s72', 1], 'add_37': ['s72', 1], 'rsqrt_12': ['s72', 1], 'x_67': ['s72', 4096], 'x_68': ['s72', 4096], 'x_69': ['s72', 4096], 'output_parallel_25': ['s72', 3072], 'q_6': ['s72', 2048], 'k_6': ['s72', 512], 'v_6': ['s72', 512], 'positions_6': ['s72'], 'cos_sin_6': ['s72', 128], 'cos_18': ['s72', 64], 'sin_18': ['s72', 64], 'query_18': ['s72', 16, 128], 'query_rot_6': ['s72', 16, 128], 'query_pass_6': ['s72', 16, 0], 'unsqueeze_25': ['s72', 1, 64], 'cos_19': ['s72', 1, 64], 'unsqueeze_26': ['s72', 1, 64], 'sin_19': ['s72', 1, 64], 'x1_12': ['s72', 16, 64], 'x2_12': ['s72', 16, 64], 'mul_83': ['s72', 16, 64], 'mul_84': ['s72', 16, 64], 'o1_12': ['s72', 16, 64], 'mul_85': ['s72', 16, 64], 'mul_86': ['s72', 16, 64], 'o2_12': ['s72', 16, 64], 'output_37': ['s72', 16, 128], 'cat_25': ['s72', 16, 128], 'query_19': ['s72', 2048], 'key_18': 
['s72', 4, 128], 'key_rot_6': ['s72', 4, 128], 'key_pass_6': ['s72', 4, 0], 'unsqueeze_27': ['s72', 1, 64], 'cos_20': ['s72', 1, 64], 'unsqueeze_28': ['s72', 1, 64], 'sin_20': ['s72', 1, 64], 'x1_13': ['s72', 4, 64], 'x2_13': ['s72', 4, 64], 'mul_87': ['s72', 4, 64], 'mul_88': ['s72', 4, 64], 'o1_13': ['s72', 4, 64], 'mul_89': ['s72', 4, 64], 'mul_90': ['s72', 4, 64], 'o2_13': ['s72', 4, 64], 'output_38': ['s72', 4, 128], 'cat_27': ['s72', 4, 128], 'key_19': ['s72', 512], 'output_39': ['s72', 2048], 'query_20': ['s72', 16, 128], 'output_40': ['s72', 16, 128], 'key_20': ['s72', 4, 128], 'value_6': ['s72', 4, 128], 'attn_output_6': ['s72', 2048], 'output_parallel_26': ['s72', 4096], 'output_41': ['s72', 4096], '_get_data_attr_13': [4096], 'x_70': ['s72', 4096], 'x_71': ['s72', 4096], 'residual_12': ['s72', 4096], 'pow_14': ['s72', 4096], 'variance_13': ['s72', 1], 'add_41': ['s72', 1], 'rsqrt_13': ['s72', 1], 'x_72': ['s72', 4096], 'x_73': ['s72', 4096], 'x_74': ['s72', 4096], 'output_parallel_27': ['s72', 14336], 'getitem_194': ['s72', 7168], 'silu_6': ['s72', 7168], 'getitem_195': ['s72', 7168], 'x_75': ['s72', 7168], 'output_parallel_28': ['s72', 4096], 'output_42': ['s72', 4096], '_get_data_attr_14': [4096], 'x_76': ['s72', 4096], 'x_77': ['s72', 4096], 'residual_13': ['s72', 4096], 'pow_15': ['s72', 4096], 'variance_14': ['s72', 1], 'add_43': ['s72', 1], 'rsqrt_14': ['s72', 1], 'x_78': ['s72', 4096], 'x_79': ['s72', 4096], 'x_80': ['s72', 4096], 'output_parallel_29': ['s72', 3072], 'q_7': ['s72', 2048], 'k_7': ['s72', 512], 'v_7': ['s72', 512], 'positions_7': ['s72'], 'cos_sin_7': ['s72', 128], 'cos_21': ['s72', 64], 'sin_21': ['s72', 64], 'query_21': ['s72', 16, 128], 'query_rot_7': ['s72', 16, 128], 'query_pass_7': ['s72', 16, 0], 'unsqueeze_29': ['s72', 1, 64], 'cos_22': ['s72', 1, 64], 'unsqueeze_30': ['s72', 1, 64], 'sin_22': ['s72', 1, 64], 'x1_14': ['s72', 16, 64], 'x2_14': ['s72', 16, 64], 'mul_96': ['s72', 16, 64], 'mul_97': ['s72', 16, 64], 'o1_14': ['s72', 16, 64], 'mul_98': ['s72', 16, 64], 'mul_99': ['s72', 16, 64], 'o2_14': ['s72', 16, 64], 'output_43': ['s72', 16, 128], 'cat_29': ['s72', 16, 128], 'query_22': ['s72', 2048], 'key_21': ['s72', 4, 128], 'key_rot_7': ['s72', 4, 128], 'key_pass_7': ['s72', 4, 0], 'unsqueeze_31': ['s72', 1, 64], 'cos_23': ['s72', 1, 64], 'unsqueeze_32': ['s72', 1, 64], 'sin_23': ['s72', 1, 64], 'x1_15': ['s72', 4, 64], 'x2_15': ['s72', 4, 64], 'mul_100': ['s72', 4, 64], 'mul_101': ['s72', 4, 64], 'o1_15': ['s72', 4, 64], 'mul_102': ['s72', 4, 64], 'mul_103': ['s72', 4, 64], 'o2_15': ['s72', 4, 64], 'output_44': ['s72', 4, 128], 'cat_31': ['s72', 4, 128], 'key_22': ['s72', 512], 'output_45': ['s72', 2048], 'query_23': ['s72', 16, 128], 'output_46': ['s72', 16, 128], 'key_23': ['s72', 4, 128], 'value_7': ['s72', 4, 128], 'attn_output_7': ['s72', 2048], 'output_parallel_30': ['s72', 4096], 'output_47': ['s72', 4096], '_get_data_attr_15': [4096], 'x_81': ['s72', 4096], 'x_82': ['s72', 4096], 'residual_14': ['s72', 4096], 'pow_16': ['s72', 4096], 'variance_15': ['s72', 1], 'add_47': ['s72', 1], 'rsqrt_15': ['s72', 1], 'x_83': ['s72', 4096], 'x_84': ['s72', 4096], 'x_85': ['s72', 4096], 'output_parallel_31': ['s72', 14336], 'getitem_222': ['s72', 7168], 'silu_7': ['s72', 7168], 'getitem_223': ['s72', 7168], 'x_86': ['s72', 7168], 'output_parallel_32': ['s72', 4096], 'output_48': ['s72', 4096], '_get_data_attr_16': [4096], 'x_87': ['s72', 4096], 'x_88': ['s72', 4096], 'residual_15': ['s72', 4096], 'pow_17': ['s72', 4096], 'variance_16': ['s72', 1], 
'add_49': ['s72', 1], 'rsqrt_16': ['s72', 1], 'x_89': ['s72', 4096], 'x_90': ['s72', 4096], 'x_91': ['s72', 4096], 'output_parallel_33': ['s72', 3072], 'q_8': ['s72', 2048], 'k_8': ['s72', 512], 'v_8': ['s72', 512], 'positions_8': ['s72'], 'cos_sin_8': ['s72', 128], 'cos_24': ['s72', 64], 'sin_24': ['s72', 64], 'query_24': ['s72', 16, 128], 'query_rot_8': ['s72', 16, 128], 'query_pass_8': ['s72', 16, 0], 'unsqueeze_33': ['s72', 1, 64], 'cos_25': ['s72', 1, 64], 'unsqueeze_34': ['s72', 1, 64], 'sin_25': ['s72', 1, 64], 'x1_16': ['s72', 16, 64], 'x2_16': ['s72', 16, 64], 'mul_109': ['s72', 16, 64], 'mul_110': ['s72', 16, 64], 'o1_16': ['s72', 16, 64], 'mul_111': ['s72', 16, 64], 'mul_112': ['s72', 16, 64], 'o2_16': ['s72', 16, 64], 'output_49': ['s72', 16, 128], 'cat_33': ['s72', 16, 128], 'query_25': ['s72', 2048], 'key_24': ['s72', 4, 128], 'key_rot_8': ['s72', 4, 128], 'key_pass_8': ['s72', 4, 0], 'unsqueeze_35': ['s72', 1, 64], 'cos_26': ['s72', 1, 64], 'unsqueeze_36': ['s72', 1, 64], 'sin_26': ['s72', 1, 64], 'x1_17': ['s72', 4, 64], 'x2_17': ['s72', 4, 64], 'mul_113': ['s72', 4, 64], 'mul_114': ['s72', 4, 64], 'o1_17': ['s72', 4, 64], 'mul_115': ['s72', 4, 64], 'mul_116': ['s72', 4, 64], 'o2_17': ['s72', 4, 64], 'output_50': ['s72', 4, 128], 'cat_35': ['s72', 4, 128], 'key_25': ['s72', 512], 'output_51': ['s72', 2048], 'query_26': ['s72', 16, 128], 'output_52': ['s72', 16, 128], 'key_26': ['s72', 4, 128], 'value_8': ['s72', 4, 128], 'attn_output_8': ['s72', 2048], 'output_parallel_34': ['s72', 4096], 'output_53': ['s72', 4096], '_get_data_attr_17': [4096], 'x_92': ['s72', 4096], 'x_93': ['s72', 4096], 'residual_16': ['s72', 4096], 'pow_18': ['s72', 4096], 'variance_17': ['s72', 1], 'add_53': ['s72', 1], 'rsqrt_17': ['s72', 1], 'x_94': ['s72', 4096], 'x_95': ['s72', 4096], 'x_96': ['s72', 4096], 'output_parallel_35': ['s72', 14336], 'getitem_250': ['s72', 7168], 'silu_8': ['s72', 7168], 'getitem_251': ['s72', 7168], 'x_97': ['s72', 7168], 'output_parallel_36': ['s72', 4096], 'output_54': ['s72', 4096], '_get_data_attr_18': [4096], 'x_98': ['s72', 4096], 'x_99': ['s72', 4096], 'residual_17': ['s72', 4096], 'pow_19': ['s72', 4096], 'variance_18': ['s72', 1], 'add_55': ['s72', 1], 'rsqrt_18': ['s72', 1], 'x_100': ['s72', 4096], 'x_101': ['s72', 4096], 'x_102': ['s72', 4096], 'output_parallel_37': ['s72', 3072], 'q_9': ['s72', 2048], 'k_9': ['s72', 512], 'v_9': ['s72', 512], 'positions_9': ['s72'], 'cos_sin_9': ['s72', 128], 'cos_27': ['s72', 64], 'sin_27': ['s72', 64], 'query_27': ['s72', 16, 128], 'query_rot_9': ['s72', 16, 128], 'query_pass_9': ['s72', 16, 0], 'unsqueeze_37': ['s72', 1, 64], 'cos_28': ['s72', 1, 64], 'unsqueeze_38': ['s72', 1, 64], 'sin_28': ['s72', 1, 64], 'x1_18': ['s72', 16, 64], 'x2_18': ['s72', 16, 64], 'mul_122': ['s72', 16, 64], 'mul_123': ['s72', 16, 64], 'o1_18': ['s72', 16, 64], 'mul_124': ['s72', 16, 64], 'mul_125': ['s72', 16, 64], 'o2_18': ['s72', 16, 64], 'output_55': ['s72', 16, 128], 'cat_37': ['s72', 16, 128], 'query_28': ['s72', 2048], 'key_27': ['s72', 4, 128], 'key_rot_9': ['s72', 4, 128], 'key_pass_9': ['s72', 4, 0], 'unsqueeze_39': ['s72', 1, 64], 'cos_29': ['s72', 1, 64], 'unsqueeze_40': ['s72', 1, 64], 'sin_29': ['s72', 1, 64], 'x1_19': ['s72', 4, 64], 'x2_19': ['s72', 4, 64], 'mul_126': ['s72', 4, 64], 'mul_127': ['s72', 4, 64], 'o1_19': ['s72', 4, 64], 'mul_128': ['s72', 4, 64], 'mul_129': ['s72', 4, 64], 'o2_19': ['s72', 4, 64], 'output_56': ['s72', 4, 128], 'cat_39': ['s72', 4, 128], 'key_28': ['s72', 512], 'output_57': ['s72', 2048], 
'query_29': ['s72', 16, 128], 'output_58': ['s72', 16, 128], 'key_29': ['s72', 4, 128], 'value_9': ['s72', 4, 128], 'attn_output_9': ['s72', 2048], 'output_parallel_38': ['s72', 4096], 'output_59': ['s72', 4096], '_get_data_attr_19': [4096], 'x_103': ['s72', 4096], 'x_104': ['s72', 4096], 'residual_18': ['s72', 4096], 'pow_20': ['s72', 4096], 'variance_19': ['s72', 1], 'add_59': ['s72', 1], 'rsqrt_19': ['s72', 1], 'x_105': ['s72', 4096], 'x_106': ['s72', 4096], 'x_107': ['s72', 4096], 'output_parallel_39': ['s72', 14336], 'getitem_278': ['s72', 7168], 'silu_9': ['s72', 7168], 'getitem_279': ['s72', 7168], 'x_108': ['s72', 7168], 'output_parallel_40': ['s72', 4096], 'output_60': ['s72', 4096], '_get_data_attr_20': [4096], 'x_109': ['s72', 4096], 'x_110': ['s72', 4096], 'residual_19': ['s72', 4096], 'pow_21': ['s72', 4096], 'variance_20': ['s72', 1], 'add_61': ['s72', 1], 'rsqrt_20': ['s72', 1], 'x_111': ['s72', 4096], 'x_112': ['s72', 4096], 'x_113': ['s72', 4096], 'output_parallel_41': ['s72', 3072], 'q_10': ['s72', 2048], 'k_10': ['s72', 512], 'v_10': ['s72', 512], 'positions_10': ['s72'], 'cos_sin_10': ['s72', 128], 'cos_30': ['s72', 64], 'sin_30': ['s72', 64], 'query_30': ['s72', 16, 128], 'query_rot_10': ['s72', 16, 128], 'query_pass_10': ['s72', 16, 0], 'unsqueeze_41': ['s72', 1, 64], 'cos_31': ['s72', 1, 64], 'unsqueeze_42': ['s72', 1, 64], 'sin_31': ['s72', 1, 64], 'x1_20': ['s72', 16, 64], 'x2_20': ['s72', 16, 64], 'mul_135': ['s72', 16, 64], 'mul_136': ['s72', 16, 64], 'o1_20': ['s72', 16, 64], 'mul_137': ['s72', 16, 64], 'mul_138': ['s72', 16, 64], 'o2_20': ['s72', 16, 64], 'output_61': ['s72', 16, 128], 'cat_41': ['s72', 16, 128], 'query_31': ['s72', 2048], 'key_30': ['s72', 4, 128], 'key_rot_10': ['s72', 4, 128], 'key_pass_10': ['s72', 4, 0], 'unsqueeze_43': ['s72', 1, 64], 'cos_32': ['s72', 1, 64], 'unsqueeze_44': ['s72', 1, 64], 'sin_32': ['s72', 1, 64], 'x1_21': ['s72', 4, 64], 'x2_21': ['s72', 4, 64], 'mul_139': ['s72', 4, 64], 'mul_140': ['s72', 4, 64], 'o1_21': ['s72', 4, 64], 'mul_141': ['s72', 4, 64], 'mul_142': ['s72', 4, 64], 'o2_21': ['s72', 4, 64], 'output_62': ['s72', 4, 128], 'cat_43': ['s72', 4, 128], 'key_31': ['s72', 512], 'output_63': ['s72', 2048], 'query_32': ['s72', 16, 128], 'output_64': ['s72', 16, 128], 'key_32': ['s72', 4, 128], 'value_10': ['s72', 4, 128], 'attn_output_10': ['s72', 2048], 'output_parallel_42': ['s72', 4096], 'output_65': ['s72', 4096], '_get_data_attr_21': [4096], 'x_114': ['s72', 4096], 'x_115': ['s72', 4096], 'residual_20': ['s72', 4096], 'pow_22': ['s72', 4096], 'variance_21': ['s72', 1], 'add_65': ['s72', 1], 'rsqrt_21': ['s72', 1], 'x_116': ['s72', 4096], 'x_117': ['s72', 4096], 'x_118': ['s72', 4096], 'output_parallel_43': ['s72', 14336], 'getitem_306': ['s72', 7168], 'silu_10': ['s72', 7168], 'getitem_307': ['s72', 7168], 'x_119': ['s72', 7168], 'output_parallel_44': ['s72', 4096], 'output_66': ['s72', 4096], '_get_data_attr_22': [4096], 'x_120': ['s72', 4096], 'x_121': ['s72', 4096], 'residual_21': ['s72', 4096], 'pow_23': ['s72', 4096], 'variance_22': ['s72', 1], 'add_67': ['s72', 1], 'rsqrt_22': ['s72', 1], 'x_122': ['s72', 4096], 'x_123': ['s72', 4096], 'x_124': ['s72', 4096], 'output_parallel_45': ['s72', 3072], 'q_11': ['s72', 2048], 'k_11': ['s72', 512], 'v_11': ['s72', 512], 'positions_11': ['s72'], 'cos_sin_11': ['s72', 128], 'cos_33': ['s72', 64], 'sin_33': ['s72', 64], 'query_33': ['s72', 16, 128], 'query_rot_11': ['s72', 16, 128], 'query_pass_11': ['s72', 16, 0], 'unsqueeze_45': ['s72', 1, 64], 'cos_34': ['s72', 1, 
64], 'unsqueeze_46': ['s72', 1, 64], 'sin_34': ['s72', 1, 64], 'x1_22': ['s72', 16, 64], 'x2_22': ['s72', 16, 64], 'mul_148': ['s72', 16, 64], 'mul_149': ['s72', 16, 64], 'o1_22': ['s72', 16, 64], 'mul_150': ['s72', 16, 64], 'mul_151': ['s72', 16, 64], 'o2_22': ['s72', 16, 64], 'output_67': ['s72', 16, 128], 'cat_45': ['s72', 16, 128], 'query_34': ['s72', 2048], 'key_33': ['s72', 4, 128], 'key_rot_11': ['s72', 4, 128], 'key_pass_11': ['s72', 4, 0], 'unsqueeze_47': ['s72', 1, 64], 'cos_35': ['s72', 1, 64], 'unsqueeze_48': ['s72', 1, 64], 'sin_35': ['s72', 1, 64], 'x1_23': ['s72', 4, 64], 'x2_23': ['s72', 4, 64], 'mul_152': ['s72', 4, 64], 'mul_153': ['s72', 4, 64], 'o1_23': ['s72', 4, 64], 'mul_154': ['s72', 4, 64], 'mul_155': ['s72', 4, 64], 'o2_23': ['s72', 4, 64], 'output_68': ['s72', 4, 128], 'cat_47': ['s72', 4, 128], 'key_34': ['s72', 512], 'output_69': ['s72', 2048], 'query_35': ['s72', 16, 128], 'output_70': ['s72', 16, 128], 'key_35': ['s72', 4, 128], 'value_11': ['s72', 4, 128], 'attn_output_11': ['s72', 2048], 'output_parallel_46': ['s72', 4096], 'output_71': ['s72', 4096], '_get_data_attr_23': [4096], 'x_125': ['s72', 4096], 'x_126': ['s72', 4096], 'residual_22': ['s72', 4096], 'pow_24': ['s72', 4096], 'variance_23': ['s72', 1], 'add_71': ['s72', 1], 'rsqrt_23': ['s72', 1], 'x_127': ['s72', 4096], 'x_128': ['s72', 4096], 'x_129': ['s72', 4096], 'output_parallel_47': ['s72', 14336], 'getitem_334': ['s72', 7168], 'silu_11': ['s72', 7168], 'getitem_335': ['s72', 7168], 'x_130': ['s72', 7168], 'output_parallel_48': ['s72', 4096], 'output_72': ['s72', 4096], '_get_data_attr_24': [4096], 'x_131': ['s72', 4096], 'x_132': ['s72', 4096], 'residual_23': ['s72', 4096], 'pow_25': ['s72', 4096], 'variance_24': ['s72', 1], 'add_73': ['s72', 1], 'rsqrt_24': ['s72', 1], 'x_133': ['s72', 4096], 'x_134': ['s72', 4096], 'x_135': ['s72', 4096], 'output_parallel_49': ['s72', 3072], 'q_12': ['s72', 2048], 'k_12': ['s72', 512], 'v_12': ['s72', 512], 'positions_12': ['s72'], 'cos_sin_12': ['s72', 128], 'cos_36': ['s72', 64], 'sin_36': ['s72', 64], 'query_36': ['s72', 16, 128], 'query_rot_12': ['s72', 16, 128], 'query_pass_12': ['s72', 16, 0], 'unsqueeze_49': ['s72', 1, 64], 'cos_37': ['s72', 1, 64], 'unsqueeze_50': ['s72', 1, 64], 'sin_37': ['s72', 1, 64], 'x1_24': ['s72', 16, 64], 'x2_24': ['s72', 16, 64], 'mul_161': ['s72', 16, 64], 'mul_162': ['s72', 16, 64], 'o1_24': ['s72', 16, 64], 'mul_163': ['s72', 16, 64], 'mul_164': ['s72', 16, 64], 'o2_24': ['s72', 16, 64], 'output_73': ['s72', 16, 128], 'cat_49': ['s72', 16, 128], 'query_37': ['s72', 2048], 'key_36': ['s72', 4, 128], 'key_rot_12': ['s72', 4, 128], 'key_pass_12': ['s72', 4, 0], 'unsqueeze_51': ['s72', 1, 64], 'cos_38': ['s72', 1, 64], 'unsqueeze_52': ['s72', 1, 64], 'sin_38': ['s72', 1, 64], 'x1_25': ['s72', 4, 64], 'x2_25': ['s72', 4, 64], 'mul_165': ['s72', 4, 64], 'mul_166': ['s72', 4, 64], 'o1_25': ['s72', 4, 64], 'mul_167': ['s72', 4, 64], 'mul_168': ['s72', 4, 64], 'o2_25': ['s72', 4, 64], 'output_74': ['s72', 4, 128], 'cat_51': ['s72', 4, 128], 'key_37': ['s72', 512], 'output_75': ['s72', 2048], 'query_38': ['s72', 16, 128], 'output_76': ['s72', 16, 128], 'key_38': ['s72', 4, 128], 'value_12': ['s72', 4, 128], 'attn_output_12': ['s72', 2048], 'output_parallel_50': ['s72', 4096], 'output_77': ['s72', 4096], '_get_data_attr_25': [4096], 'x_136': ['s72', 4096], 'x_137': ['s72', 4096], 'residual_24': ['s72', 4096], 'pow_26': ['s72', 4096], 'variance_25': ['s72', 1], 'add_77': ['s72', 1], 'rsqrt_25': ['s72', 1], 'x_138': ['s72', 4096], 
'x_139': ['s72', 4096], 'x_140': ['s72', 4096], 'output_parallel_51': ['s72', 14336], 'getitem_362': ['s72', 7168], 'silu_12': ['s72', 7168], 'getitem_363': ['s72', 7168], 'x_141': ['s72', 7168], 'output_parallel_52': ['s72', 4096], 'output_78': ['s72', 4096], '_get_data_attr_26': [4096], 'x_142': ['s72', 4096], 'x_143': ['s72', 4096], 'residual_25': ['s72', 4096], 'pow_27': ['s72', 4096], 'variance_26': ['s72', 1], 'add_79': ['s72', 1], 'rsqrt_26': ['s72', 1], 'x_144': ['s72', 4096], 'x_145': ['s72', 4096], 'x_146': ['s72', 4096], 'output_parallel_53': ['s72', 3072], 'q_13': ['s72', 2048], 'k_13': ['s72', 512], 'v_13': ['s72', 512], 'positions_13': ['s72'], 'cos_sin_13': ['s72', 128], 'cos_39': ['s72', 64], 'sin_39': ['s72', 64], 'query_39': ['s72', 16, 128], 'query_rot_13': ['s72', 16, 128], 'query_pass_13': ['s72', 16, 0], 'unsqueeze_53': ['s72', 1, 64], 'cos_40': ['s72', 1, 64], 'unsqueeze_54': ['s72', 1, 64], 'sin_40': ['s72', 1, 64], 'x1_26': ['s72', 16, 64], 'x2_26': ['s72', 16, 64], 'mul_174': ['s72', 16, 64], 'mul_175': ['s72', 16, 64], 'o1_26': ['s72', 16, 64], 'mul_176': ['s72', 16, 64], 'mul_177': ['s72', 16, 64], 'o2_26': ['s72', 16, 64], 'output_79': ['s72', 16, 128], 'cat_53': ['s72', 16, 128], 'query_40': ['s72', 2048], 'key_39': ['s72', 4, 128], 'key_rot_13': ['s72', 4, 128], 'key_pass_13': ['s72', 4, 0], 'unsqueeze_55': ['s72', 1, 64], 'cos_41': ['s72', 1, 64], 'unsqueeze_56': ['s72', 1, 64], 'sin_41': ['s72', 1, 64], 'x1_27': ['s72', 4, 64], 'x2_27': ['s72', 4, 64], 'mul_178': ['s72', 4, 64], 'mul_179': ['s72', 4, 64], 'o1_27': ['s72', 4, 64], 'mul_180': ['s72', 4, 64], 'mul_181': ['s72', 4, 64], 'o2_27': ['s72', 4, 64], 'output_80': ['s72', 4, 128], 'cat_55': ['s72', 4, 128], 'key_40': ['s72', 512], 'output_81': ['s72', 2048], 'query_41': ['s72', 16, 128], 'output_82': ['s72', 16, 128], 'key_41': ['s72', 4, 128], 'value_13': ['s72', 4, 128], 'attn_output_13': ['s72', 2048], 'output_parallel_54': ['s72', 4096], 'output_83': ['s72', 4096], '_get_data_attr_27': [4096], 'x_147': ['s72', 4096], 'x_148': ['s72', 4096], 'residual_26': ['s72', 4096], 'pow_28': ['s72', 4096], 'variance_27': ['s72', 1], 'add_83': ['s72', 1], 'rsqrt_27': ['s72', 1], 'x_149': ['s72', 4096], 'x_150': ['s72', 4096], 'x_151': ['s72', 4096], 'output_parallel_55': ['s72', 14336], 'getitem_390': ['s72', 7168], 'silu_13': ['s72', 7168], 'getitem_391': ['s72', 7168], 'x_152': ['s72', 7168], 'output_parallel_56': ['s72', 4096], 'output_84': ['s72', 4096], '_get_data_attr_28': [4096], 'x_153': ['s72', 4096], 'x_154': ['s72', 4096], 'residual_27': ['s72', 4096], 'pow_29': ['s72', 4096], 'variance_28': ['s72', 1], 'add_85': ['s72', 1], 'rsqrt_28': ['s72', 1], 'x_155': ['s72', 4096], 'x_156': ['s72', 4096], 'x_157': ['s72', 4096], 'output_parallel_57': ['s72', 3072], 'q_14': ['s72', 2048], 'k_14': ['s72', 512], 'v_14': ['s72', 512], 'positions_14': ['s72'], 'cos_sin_14': ['s72', 128], 'cos_42': ['s72', 64], 'sin_42': ['s72', 64], 'query_42': ['s72', 16, 128], 'query_rot_14': ['s72', 16, 128], 'query_pass_14': ['s72', 16, 0], 'unsqueeze_57': ['s72', 1, 64], 'cos_43': ['s72', 1, 64], 'unsqueeze_58': ['s72', 1, 64], 'sin_43': ['s72', 1, 64], 'x1_28': ['s72', 16, 64], 'x2_28': ['s72', 16, 64], 'mul_187': ['s72', 16, 64], 'mul_188': ['s72', 16, 64], 'o1_28': ['s72', 16, 64], 'mul_189': ['s72', 16, 64], 'mul_190': ['s72', 16, 64], 'o2_28': ['s72', 16, 64], 'output_85': ['s72', 16, 128], 'cat_57': ['s72', 16, 128], 'query_43': ['s72', 2048], 'key_42': ['s72', 4, 128], 'key_rot_14': ['s72', 4, 128], 'key_pass_14': 
['s72', 4, 0], 'unsqueeze_59': ['s72', 1, 64], 'cos_44': ['s72', 1, 64], 'unsqueeze_60': ['s72', 1, 64], 'sin_44': ['s72', 1, 64], 'x1_29': ['s72', 4, 64], 'x2_29': ['s72', 4, 64], 'mul_191': ['s72', 4, 64], 'mul_192': ['s72', 4, 64], 'o1_29': ['s72', 4, 64], 'mul_193': ['s72', 4, 64], 'mul_194': ['s72', 4, 64], 'o2_29': ['s72', 4, 64], 'output_86': ['s72', 4, 128], 'cat_59': ['s72', 4, 128], 'key_43': ['s72', 512], 'output_87': ['s72', 2048], 'query_44': ['s72', 16, 128], 'output_88': ['s72', 16, 128], 'key_44': ['s72', 4, 128], 'value_14': ['s72', 4, 128], 'attn_output_14': ['s72', 2048], 'output_parallel_58': ['s72', 4096], 'output_89': ['s72', 4096], '_get_data_attr_29': [4096], 'x_158': ['s72', 4096], 'x_159': ['s72', 4096], 'residual_28': ['s72', 4096], 'pow_30': ['s72', 4096], 'variance_29': ['s72', 1], 'add_89': ['s72', 1], 'rsqrt_29': ['s72', 1], 'x_160': ['s72', 4096], 'x_161': ['s72', 4096], 'x_162': ['s72', 4096], 'output_parallel_59': ['s72', 14336], 'getitem_418': ['s72', 7168], 'silu_14': ['s72', 7168], 'getitem_419': ['s72', 7168], 'x_163': ['s72', 7168], 'output_parallel_60': ['s72', 4096], 'output_90': ['s72', 4096], '_get_data_attr_30': [4096], 'x_164': ['s72', 4096], 'x_165': ['s72', 4096], 'residual_29': ['s72', 4096], 'pow_31': ['s72', 4096], 'variance_30': ['s72', 1], 'add_91': ['s72', 1], 'rsqrt_30': ['s72', 1], 'x_166': ['s72', 4096], 'x_167': ['s72', 4096], 'x_168': ['s72', 4096], 'output_parallel_61': ['s72', 3072], 'q_15': ['s72', 2048], 'k_15': ['s72', 512], 'v_15': ['s72', 512], 'positions_15': ['s72'], 'cos_sin_15': ['s72', 128], 'cos_45': ['s72', 64], 'sin_45': ['s72', 64], 'query_45': ['s72', 16, 128], 'query_rot_15': ['s72', 16, 128], 'query_pass_15': ['s72', 16, 0], 'unsqueeze_61': ['s72', 1, 64], 'cos_46': ['s72', 1, 64], 'unsqueeze_62': ['s72', 1, 64], 'sin_46': ['s72', 1, 64], 'x1_30': ['s72', 16, 64], 'x2_30': ['s72', 16, 64], 'mul_200': ['s72', 16, 64], 'mul_201': ['s72', 16, 64], 'o1_30': ['s72', 16, 64], 'mul_202': ['s72', 16, 64], 'mul_203': ['s72', 16, 64], 'o2_30': ['s72', 16, 64], 'output_91': ['s72', 16, 128], 'cat_61': ['s72', 16, 128], 'query_46': ['s72', 2048], 'key_45': ['s72', 4, 128], 'key_rot_15': ['s72', 4, 128], 'key_pass_15': ['s72', 4, 0], 'unsqueeze_63': ['s72', 1, 64], 'cos_47': ['s72', 1, 64], 'unsqueeze_64': ['s72', 1, 64], 'sin_47': ['s72', 1, 64], 'x1_31': ['s72', 4, 64], 'x2_31': ['s72', 4, 64], 'mul_204': ['s72', 4, 64], 'mul_205': ['s72', 4, 64], 'o1_31': ['s72', 4, 64], 'mul_206': ['s72', 4, 64], 'mul_207': ['s72', 4, 64], 'o2_31': ['s72', 4, 64], 'output_92': ['s72', 4, 128], 'cat_63': ['s72', 4, 128], 'key_46': ['s72', 512], 'output_93': ['s72', 2048], 'query_47': ['s72', 16, 128], 'output_94': ['s72', 16, 128], 'key_47': ['s72', 4, 128], 'value_15': ['s72', 4, 128], 'attn_output_15': ['s72', 2048], 'output_parallel_62': ['s72', 4096], 'output_95': ['s72', 4096], '_get_data_attr_31': [4096], 'x_169': ['s72', 4096], 'x_170': ['s72', 4096], 'residual_30': ['s72', 4096], 'pow_32': ['s72', 4096], 'variance_31': ['s72', 1], 'add_95': ['s72', 1], 'rsqrt_31': ['s72', 1], 'x_171': ['s72', 4096], 'x_172': ['s72', 4096], 'x_173': ['s72', 4096], 'output_parallel_63': ['s72', 14336], 'getitem_446': ['s72', 7168], 'silu_15': ['s72', 7168], 'getitem_447': ['s72', 7168], 'x_174': ['s72', 7168], 'output_parallel_64': ['s72', 4096], 'output_96': ['s72', 4096], '_get_data_attr_32': [4096], 'x_175': ['s72', 4096], 'x_176': ['s72', 4096], 'residual_31': ['s72', 4096], 'pow_33': ['s72', 4096], 'variance_32': ['s72', 1], 'add_97': ['s72', 1], 
'rsqrt_32': ['s72', 1], 'x_177': ['s72', 4096], 'x_178': ['s72', 4096], 'x_179': ['s72', 4096], 'output_parallel_65': ['s72', 3072], 'q_16': ['s72', 2048], 'k_16': ['s72', 512], 'v_16': ['s72', 512], 'positions_16': ['s72'], 'cos_sin_16': ['s72', 128], 'cos_48': ['s72', 64], 'sin_48': ['s72', 64], 'query_48': ['s72', 16, 128], 'query_rot_16': ['s72', 16, 128], 'query_pass_16': ['s72', 16, 0], 'unsqueeze_65': ['s72', 1, 64], 'cos_49': ['s72', 1, 64], 'unsqueeze_66': ['s72', 1, 64], 'sin_49': ['s72', 1, 64], 'x1_32': ['s72', 16, 64], 'x2_32': ['s72', 16, 64], 'mul_213': ['s72', 16, 64], 'mul_214': ['s72', 16, 64], 'o1_32': ['s72', 16, 64], 'mul_215': ['s72', 16, 64], 'mul_216': ['s72', 16, 64], 'o2_32': ['s72', 16, 64], 'output_97': ['s72', 16, 128], 'cat_65': ['s72', 16, 128], 'query_49': ['s72', 2048], 'key_48': ['s72', 4, 128], 'key_rot_16': ['s72', 4, 128], 'key_pass_16': ['s72', 4, 0], 'unsqueeze_67': ['s72', 1, 64], 'cos_50': ['s72', 1, 64], 'unsqueeze_68': ['s72', 1, 64], 'sin_50': ['s72', 1, 64], 'x1_33': ['s72', 4, 64], 'x2_33': ['s72', 4, 64], 'mul_217': ['s72', 4, 64], 'mul_218': ['s72', 4, 64], 'o1_33': ['s72', 4, 64], 'mul_219': ['s72', 4, 64], 'mul_220': ['s72', 4, 64], 'o2_33': ['s72', 4, 64], 'output_98': ['s72', 4, 128], 'cat_67': ['s72', 4, 128], 'key_49': ['s72', 512], 'output_99': ['s72', 2048], 'query_50': ['s72', 16, 128], 'output_100': ['s72', 16, 128], 'key_50': ['s72', 4, 128], 'value_16': ['s72', 4, 128], 'attn_output_16': ['s72', 2048], 'output_parallel_66': ['s72', 4096], 'output_101': ['s72', 4096], '_get_data_attr_33': [4096], 'x_180': ['s72', 4096], 'x_181': ['s72', 4096], 'residual_32': ['s72', 4096], 'pow_34': ['s72', 4096], 'variance_33': ['s72', 1], 'add_101': ['s72', 1], 'rsqrt_33': ['s72', 1], 'x_182': ['s72', 4096], 'x_183': ['s72', 4096], 'x_184': ['s72', 4096], 'output_parallel_67': ['s72', 14336], 'getitem_474': ['s72', 7168], 'silu_16': ['s72', 7168], 'getitem_475': ['s72', 7168], 'x_185': ['s72', 7168], 'output_parallel_68': ['s72', 4096], 'output_102': ['s72', 4096], '_get_data_attr_34': [4096], 'x_186': ['s72', 4096], 'x_187': ['s72', 4096], 'residual_33': ['s72', 4096], 'pow_35': ['s72', 4096], 'variance_34': ['s72', 1], 'add_103': ['s72', 1], 'rsqrt_34': ['s72', 1], 'x_188': ['s72', 4096], 'x_189': ['s72', 4096], 'x_190': ['s72', 4096], 'output_parallel_69': ['s72', 3072], 'q_17': ['s72', 2048], 'k_17': ['s72', 512], 'v_17': ['s72', 512], 'positions_17': ['s72'], 'cos_sin_17': ['s72', 128], 'cos_51': ['s72', 64], 'sin_51': ['s72', 64], 'query_51': ['s72', 16, 128], 'query_rot_17': ['s72', 16, 128], 'query_pass_17': ['s72', 16, 0], 'unsqueeze_69': ['s72', 1, 64], 'cos_52': ['s72', 1, 64], 'unsqueeze_70': ['s72', 1, 64], 'sin_52': ['s72', 1, 64], 'x1_34': ['s72', 16, 64], 'x2_34': ['s72', 16, 64], 'mul_226': ['s72', 16, 64], 'mul_227': ['s72', 16, 64], 'o1_34': ['s72', 16, 64], 'mul_228': ['s72', 16, 64], 'mul_229': ['s72', 16, 64], 'o2_34': ['s72', 16, 64], 'output_103': ['s72', 16, 128], 'cat_69': ['s72', 16, 128], 'query_52': ['s72', 2048], 'key_51': ['s72', 4, 128], 'key_rot_17': ['s72', 4, 128], 'key_pass_17': ['s72', 4, 0], 'unsqueeze_71': ['s72', 1, 64], 'cos_53': ['s72', 1, 64], 'unsqueeze_72': ['s72', 1, 64], 'sin_53': ['s72', 1, 64], 'x1_35': ['s72', 4, 64], 'x2_35': ['s72', 4, 64], 'mul_230': ['s72', 4, 64], 'mul_231': ['s72', 4, 64], 'o1_35': ['s72', 4, 64], 'mul_232': ['s72', 4, 64], 'mul_233': ['s72', 4, 64], 'o2_35': ['s72', 4, 64], 'output_104': ['s72', 4, 128], 'cat_71': ['s72', 4, 128], 'key_52': ['s72', 512], 'output_105': 
['s72', 2048], 'query_53': ['s72', 16, 128], 'output_106': ['s72', 16, 128], 'key_53': ['s72', 4, 128], 'value_17': ['s72', 4, 128], 'attn_output_17': ['s72', 2048], 'output_parallel_70': ['s72', 4096], 'output_107': ['s72', 4096], '_get_data_attr_35': [4096], 'x_191': ['s72', 4096], 'x_192': ['s72', 4096], 'residual_34': ['s72', 4096], 'pow_36': ['s72', 4096], 'variance_35': ['s72', 1], 'add_107': ['s72', 1], 'rsqrt_35': ['s72', 1], 'x_193': ['s72', 4096], 'x_194': ['s72', 4096], 'x_195': ['s72', 4096], 'output_parallel_71': ['s72', 14336], 'getitem_502': ['s72', 7168], 'silu_17': ['s72', 7168], 'getitem_503': ['s72', 7168], 'x_196': ['s72', 7168], 'output_parallel_72': ['s72', 4096], 'output_108': ['s72', 4096], '_get_data_attr_36': [4096], 'x_197': ['s72', 4096], 'x_198': ['s72', 4096], 'residual_35': ['s72', 4096], 'pow_37': ['s72', 4096], 'variance_36': ['s72', 1], 'add_109': ['s72', 1], 'rsqrt_36': ['s72', 1], 'x_199': ['s72', 4096], 'x_200': ['s72', 4096], 'x_201': ['s72', 4096], 'output_parallel_73': ['s72', 3072], 'q_18': ['s72', 2048], 'k_18': ['s72', 512], 'v_18': ['s72', 512], 'positions_18': ['s72'], 'cos_sin_18': ['s72', 128], 'cos_54': ['s72', 64], 'sin_54': ['s72', 64], 'query_54': ['s72', 16, 128], 'query_rot_18': ['s72', 16, 128], 'query_pass_18': ['s72', 16, 0], 'unsqueeze_73': ['s72', 1, 64], 'cos_55': ['s72', 1, 64], 'unsqueeze_74': ['s72', 1, 64], 'sin_55': ['s72', 1, 64], 'x1_36': ['s72', 16, 64], 'x2_36': ['s72', 16, 64], 'mul_239': ['s72', 16, 64], 'mul_240': ['s72', 16, 64], 'o1_36': ['s72', 16, 64], 'mul_241': ['s72', 16, 64], 'mul_242': ['s72', 16, 64], 'o2_36': ['s72', 16, 64], 'output_109': ['s72', 16, 128], 'cat_73': ['s72', 16, 128], 'query_55': ['s72', 2048], 'key_54': ['s72', 4, 128], 'key_rot_18': ['s72', 4, 128], 'key_pass_18': ['s72', 4, 0], 'unsqueeze_75': ['s72', 1, 64], 'cos_56': ['s72', 1, 64], 'unsqueeze_76': ['s72', 1, 64], 'sin_56': ['s72', 1, 64], 'x1_37': ['s72', 4, 64], 'x2_37': ['s72', 4, 64], 'mul_243': ['s72', 4, 64], 'mul_244': ['s72', 4, 64], 'o1_37': ['s72', 4, 64], 'mul_245': ['s72', 4, 64], 'mul_246': ['s72', 4, 64], 'o2_37': ['s72', 4, 64], 'output_110': ['s72', 4, 128], 'cat_75': ['s72', 4, 128], 'key_55': ['s72', 512], 'output_111': ['s72', 2048], 'query_56': ['s72', 16, 128], 'output_112': ['s72', 16, 128], 'key_56': ['s72', 4, 128], 'value_18': ['s72', 4, 128], 'attn_output_18': ['s72', 2048], 'output_parallel_74': ['s72', 4096], 'output_113': ['s72', 4096], '_get_data_attr_37': [4096], 'x_202': ['s72', 4096], 'x_203': ['s72', 4096], 'residual_36': ['s72', 4096], 'pow_38': ['s72', 4096], 'variance_37': ['s72', 1], 'add_113': ['s72', 1], 'rsqrt_37': ['s72', 1], 'x_204': ['s72', 4096], 'x_205': ['s72', 4096], 'x_206': ['s72', 4096], 'output_parallel_75': ['s72', 14336], 'getitem_530': ['s72', 7168], 'silu_18': ['s72', 7168], 'getitem_531': ['s72', 7168], 'x_207': ['s72', 7168], 'output_parallel_76': ['s72', 4096], 'output_114': ['s72', 4096], '_get_data_attr_38': [4096], 'x_208': ['s72', 4096], 'x_209': ['s72', 4096], 'residual_37': ['s72', 4096], 'pow_39': ['s72', 4096], 'variance_38': ['s72', 1], 'add_115': ['s72', 1], 'rsqrt_38': ['s72', 1], 'x_210': ['s72', 4096], 'x_211': ['s72', 4096], 'x_212': ['s72', 4096], 'output_parallel_77': ['s72', 3072], 'q_19': ['s72', 2048], 'k_19': ['s72', 512], 'v_19': ['s72', 512], 'positions_19': ['s72'], 'cos_sin_19': ['s72', 128], 'cos_57': ['s72', 64], 'sin_57': ['s72', 64], 'query_57': ['s72', 16, 128], 'query_rot_19': ['s72', 16, 128], 'query_pass_19': ['s72', 16, 0], 'unsqueeze_77': ['s72', 
1, 64], 'cos_58': ['s72', 1, 64], 'unsqueeze_78': ['s72', 1, 64], 'sin_58': ['s72', 1, 64], 'x1_38': ['s72', 16, 64], 'x2_38': ['s72', 16, 64], 'mul_252': ['s72', 16, 64], 'mul_253': ['s72', 16, 64], 'o1_38': ['s72', 16, 64], 'mul_254': ['s72', 16, 64], 'mul_255': ['s72', 16, 64], 'o2_38': ['s72', 16, 64], 'output_115': ['s72', 16, 128], 'cat_77': ['s72', 16, 128], 'query_58': ['s72', 2048], 'key_57': ['s72', 4, 128], 'key_rot_19': ['s72', 4, 128], 'key_pass_19': ['s72', 4, 0], 'unsqueeze_79': ['s72', 1, 64], 'cos_59': ['s72', 1, 64], 'unsqueeze_80': ['s72', 1, 64], 'sin_59': ['s72', 1, 64], 'x1_39': ['s72', 4, 64], 'x2_39': ['s72', 4, 64], 'mul_256': ['s72', 4, 64], 'mul_257': ['s72', 4, 64], 'o1_39': ['s72', 4, 64], 'mul_258': ['s72', 4, 64], 'mul_259': ['s72', 4, 64], 'o2_39': ['s72', 4, 64], 'output_116': ['s72', 4, 128], 'cat_79': ['s72', 4, 128], 'key_58': ['s72', 512], 'output_117': ['s72', 2048], 'query_59': ['s72', 16, 128], 'output_118': ['s72', 16, 128], 'key_59': ['s72', 4, 128], 'value_19': ['s72', 4, 128], 'attn_output_19': ['s72', 2048], 'output_parallel_78': ['s72', 4096], 'output_119': ['s72', 4096], '_get_data_attr_39': [4096], 'x_213': ['s72', 4096], 'x_214': ['s72', 4096], 'residual_38': ['s72', 4096], 'pow_40': ['s72', 4096], 'variance_39': ['s72', 1], 'add_119': ['s72', 1], 'rsqrt_39': ['s72', 1], 'x_215': ['s72', 4096], 'x_216': ['s72', 4096], 'x_217': ['s72', 4096], 'output_parallel_79': ['s72', 14336], 'getitem_558': ['s72', 7168], 'silu_19': ['s72', 7168], 'getitem_559': ['s72', 7168], 'x_218': ['s72', 7168], 'output_parallel_80': ['s72', 4096], 'output_120': ['s72', 4096], '_get_data_attr_40': [4096], 'x_219': ['s72', 4096], 'x_220': ['s72', 4096], 'residual_39': ['s72', 4096], 'pow_41': ['s72', 4096], 'variance_40': ['s72', 1], 'add_121': ['s72', 1], 'rsqrt_40': ['s72', 1], 'x_221': ['s72', 4096], 'x_222': ['s72', 4096], 'x_223': ['s72', 4096], 'output_parallel_81': ['s72', 3072], 'q_20': ['s72', 2048], 'k_20': ['s72', 512], 'v_20': ['s72', 512], 'positions_20': ['s72'], 'cos_sin_20': ['s72', 128], 'cos_60': ['s72', 64], 'sin_60': ['s72', 64], 'query_60': ['s72', 16, 128], 'query_rot_20': ['s72', 16, 128], 'query_pass_20': ['s72', 16, 0], 'unsqueeze_81': ['s72', 1, 64], 'cos_61': ['s72', 1, 64], 'unsqueeze_82': ['s72', 1, 64], 'sin_61': ['s72', 1, 64], 'x1_40': ['s72', 16, 64], 'x2_40': ['s72', 16, 64], 'mul_265': ['s72', 16, 64], 'mul_266': ['s72', 16, 64], 'o1_40': ['s72', 16, 64], 'mul_267': ['s72', 16, 64], 'mul_268': ['s72', 16, 64], 'o2_40': ['s72', 16, 64], 'output_121': ['s72', 16, 128], 'cat_81': ['s72', 16, 128], 'query_61': ['s72', 2048], 'key_60': ['s72', 4, 128], 'key_rot_20': ['s72', 4, 128], 'key_pass_20': ['s72', 4, 0], 'unsqueeze_83': ['s72', 1, 64], 'cos_62': ['s72', 1, 64], 'unsqueeze_84': ['s72', 1, 64], 'sin_62': ['s72', 1, 64], 'x1_41': ['s72', 4, 64], 'x2_41': ['s72', 4, 64], 'mul_269': ['s72', 4, 64], 'mul_270': ['s72', 4, 64], 'o1_41': ['s72', 4, 64], 'mul_271': ['s72', 4, 64], 'mul_272': ['s72', 4, 64], 'o2_41': ['s72', 4, 64], 'output_122': ['s72', 4, 128], 'cat_83': ['s72', 4, 128], 'key_61': ['s72', 512], 'output_123': ['s72', 2048], 'query_62': ['s72', 16, 128], 'output_124': ['s72', 16, 128], 'key_62': ['s72', 4, 128], 'value_20': ['s72', 4, 128], 'attn_output_20': ['s72', 2048], 'output_parallel_82': ['s72', 4096], 'output_125': ['s72', 4096], '_get_data_attr_41': [4096], 'x_224': ['s72', 4096], 'x_225': ['s72', 4096], 'residual_40': ['s72', 4096], 'pow_42': ['s72', 4096], 'variance_41': ['s72', 1], 'add_125': ['s72', 1], 
'rsqrt_41': ['s72', 1], 'x_226': ['s72', 4096], 'x_227': ['s72', 4096], 'x_228': ['s72', 4096], 'output_parallel_83': ['s72', 14336], 'getitem_586': ['s72', 7168], 'silu_20': ['s72', 7168], 'getitem_587': ['s72', 7168], 'x_229': ['s72', 7168], 'output_parallel_84': ['s72', 4096], 'output_126': ['s72', 4096], '_get_data_attr_42': [4096], 'x_230': ['s72', 4096], 'x_231': ['s72', 4096], 'residual_41': ['s72', 4096], 'pow_43': ['s72', 4096], 'variance_42': ['s72', 1], 'add_127': ['s72', 1], 'rsqrt_42': ['s72', 1], 'x_232': ['s72', 4096], 'x_233': ['s72', 4096], 'x_234': ['s72', 4096], 'output_parallel_85': ['s72', 3072], 'q_21': ['s72', 2048], 'k_21': ['s72', 512], 'v_21': ['s72', 512], 'positions_21': ['s72'], 'cos_sin_21': ['s72', 128], 'cos_63': ['s72', 64], 'sin_63': ['s72', 64], 'query_63': ['s72', 16, 128], 'query_rot_21': ['s72', 16, 128], 'query_pass_21': ['s72', 16, 0], 'unsqueeze_85': ['s72', 1, 64], 'cos_64': ['s72', 1, 64], 'unsqueeze_86': ['s72', 1, 64], 'sin_64': ['s72', 1, 64], 'x1_42': ['s72', 16, 64], 'x2_42': ['s72', 16, 64], 'mul_278': ['s72', 16, 64], 'mul_279': ['s72', 16, 64], 'o1_42': ['s72', 16, 64], 'mul_280': ['s72', 16, 64], 'mul_281': ['s72', 16, 64], 'o2_42': ['s72', 16, 64], 'output_127': ['s72', 16, 128], 'cat_85': ['s72', 16, 128], 'query_64': ['s72', 2048], 'key_63': ['s72', 4, 128], 'key_rot_21': ['s72', 4, 128], 'key_pass_21': ['s72', 4, 0], 'unsqueeze_87': ['s72', 1, 64], 'cos_65': ['s72', 1, 64], 'unsqueeze_88': ['s72', 1, 64], 'sin_65': ['s72', 1, 64], 'x1_43': ['s72', 4, 64], 'x2_43': ['s72', 4, 64], 'mul_282': ['s72', 4, 64], 'mul_283': ['s72', 4, 64], 'o1_43': ['s72', 4, 64], 'mul_284': ['s72', 4, 64], 'mul_285': ['s72', 4, 64], 'o2_43': ['s72', 4, 64], 'output_128': ['s72', 4, 128], 'cat_87': ['s72', 4, 128], 'key_64': ['s72', 512], 'output_129': ['s72', 2048], 'query_65': ['s72', 16, 128], 'output_130': ['s72', 16, 128], 'key_65': ['s72', 4, 128], 'value_21': ['s72', 4, 128], 'attn_output_21': ['s72', 2048], 'output_parallel_86': ['s72', 4096], 'output_131': ['s72', 4096], '_get_data_attr_43': [4096], 'x_235': ['s72', 4096], 'x_236': ['s72', 4096], 'residual_42': ['s72', 4096], 'pow_44': ['s72', 4096], 'variance_43': ['s72', 1], 'add_131': ['s72', 1], 'rsqrt_43': ['s72', 1], 'x_237': ['s72', 4096], 'x_238': ['s72', 4096], 'x_239': ['s72', 4096], 'output_parallel_87': ['s72', 14336], 'getitem_614': ['s72', 7168], 'silu_21': ['s72', 7168], 'getitem_615': ['s72', 7168], 'x_240': ['s72', 7168], 'output_parallel_88': ['s72', 4096], 'output_132': ['s72', 4096], '_get_data_attr_44': [4096], 'x_241': ['s72', 4096], 'x_242': ['s72', 4096], 'residual_43': ['s72', 4096], 'pow_45': ['s72', 4096], 'variance_44': ['s72', 1], 'add_133': ['s72', 1], 'rsqrt_44': ['s72', 1], 'x_243': ['s72', 4096], 'x_244': ['s72', 4096], 'x_245': ['s72', 4096], 'output_parallel_89': ['s72', 3072], 'q_22': ['s72', 2048], 'k_22': ['s72', 512], 'v_22': ['s72', 512], 'positions_22': ['s72'], 'cos_sin_22': ['s72', 128], 'cos_66': ['s72', 64], 'sin_66': ['s72', 64], 'query_66': ['s72', 16, 128], 'query_rot_22': ['s72', 16, 128], 'query_pass_22': ['s72', 16, 0], 'unsqueeze_89': ['s72', 1, 64], 'cos_67': ['s72', 1, 64], 'unsqueeze_90': ['s72', 1, 64], 'sin_67': ['s72', 1, 64], 'x1_44': ['s72', 16, 64], 'x2_44': ['s72', 16, 64], 'mul_291': ['s72', 16, 64], 'mul_292': ['s72', 16, 64], 'o1_44': ['s72', 16, 64], 'mul_293': ['s72', 16, 64], 'mul_294': ['s72', 16, 64], 'o2_44': ['s72', 16, 64], 'output_133': ['s72', 16, 128], 'cat_89': ['s72', 16, 128], 'query_67': ['s72', 2048], 'key_66': ['s72', 
4, 128], 'key_rot_22': ['s72', 4, 128], 'key_pass_22': ['s72', 4, 0], 'unsqueeze_91': ['s72', 1, 64], 'cos_68': ['s72', 1, 64], 'unsqueeze_92': ['s72', 1, 64], 'sin_68': ['s72', 1, 64], 'x1_45': ['s72', 4, 64], 'x2_45': ['s72', 4, 64], 'mul_295': ['s72', 4, 64], 'mul_296': ['s72', 4, 64], 'o1_45': ['s72', 4, 64], 'mul_297': ['s72', 4, 64], 'mul_298': ['s72', 4, 64], 'o2_45': ['s72', 4, 64], 'output_134': ['s72', 4, 128], 'cat_91': ['s72', 4, 128], 'key_67': ['s72', 512], 'output_135': ['s72', 2048], 'query_68': ['s72', 16, 128], 'output_136': ['s72', 16, 128], 'key_68': ['s72', 4, 128], 'value_22': ['s72', 4, 128], 'attn_output_22': ['s72', 2048], 'output_parallel_90': ['s72', 4096], 'output_137': ['s72', 4096], '_get_data_attr_45': [4096], 'x_246': ['s72', 4096], 'x_247': ['s72', 4096], 'residual_44': ['s72', 4096], 'pow_46': ['s72', 4096], 'variance_45': ['s72', 1], 'add_137': ['s72', 1], 'rsqrt_45': ['s72', 1], 'x_248': ['s72', 4096], 'x_249': ['s72', 4096], 'x_250': ['s72', 4096], 'output_parallel_91': ['s72', 14336], 'getitem_642': ['s72', 7168], 'silu_22': ['s72', 7168], 'getitem_643': ['s72', 7168], 'x_251': ['s72', 7168], 'output_parallel_92': ['s72', 4096], 'output_138': ['s72', 4096], '_get_data_attr_46': [4096], 'x_252': ['s72', 4096], 'x_253': ['s72', 4096], 'residual_45': ['s72', 4096], 'pow_47': ['s72', 4096], 'variance_46': ['s72', 1], 'add_139': ['s72', 1], 'rsqrt_46': ['s72', 1], 'x_254': ['s72', 4096], 'x_255': ['s72', 4096], 'x_256': ['s72', 4096], 'output_parallel_93': ['s72', 3072], 'q_23': ['s72', 2048], 'k_23': ['s72', 512], 'v_23': ['s72', 512], 'positions_23': ['s72'], 'cos_sin_23': ['s72', 128], 'cos_69': ['s72', 64], 'sin_69': ['s72', 64], 'query_69': ['s72', 16, 128], 'query_rot_23': ['s72', 16, 128], 'query_pass_23': ['s72', 16, 0], 'unsqueeze_93': ['s72', 1, 64], 'cos_70': ['s72', 1, 64], 'unsqueeze_94': ['s72', 1, 64], 'sin_70': ['s72', 1, 64], 'x1_46': ['s72', 16, 64], 'x2_46': ['s72', 16, 64], 'mul_304': ['s72', 16, 64], 'mul_305': ['s72', 16, 64], 'o1_46': ['s72', 16, 64], 'mul_306': ['s72', 16, 64], 'mul_307': ['s72', 16, 64], 'o2_46': ['s72', 16, 64], 'output_139': ['s72', 16, 128], 'cat_93': ['s72', 16, 128], 'query_70': ['s72', 2048], 'key_69': ['s72', 4, 128], 'key_rot_23': ['s72', 4, 128], 'key_pass_23': ['s72', 4, 0], 'unsqueeze_95': ['s72', 1, 64], 'cos_71': ['s72', 1, 64], 'unsqueeze_96': ['s72', 1, 64], 'sin_71': ['s72', 1, 64], 'x1_47': ['s72', 4, 64], 'x2_47': ['s72', 4, 64], 'mul_308': ['s72', 4, 64], 'mul_309': ['s72', 4, 64], 'o1_47': ['s72', 4, 64], 'mul_310': ['s72', 4, 64], 'mul_311': ['s72', 4, 64], 'o2_47': ['s72', 4, 64], 'output_140': ['s72', 4, 128], 'cat_95': ['s72', 4, 128], 'key_70': ['s72', 512], 'output_141': ['s72', 2048], 'query_71': ['s72', 16, 128], 'output_142': ['s72', 16, 128], 'key_71': ['s72', 4, 128], 'value_23': ['s72', 4, 128], 'attn_output_23': ['s72', 2048], 'output_parallel_94': ['s72', 4096], 'output_143': ['s72', 4096], '_get_data_attr_47': [4096], 'x_257': ['s72', 4096], 'x_258': ['s72', 4096], 'residual_46': ['s72', 4096], 'pow_48': ['s72', 4096], 'variance_47': ['s72', 1], 'add_143': ['s72', 1], 'rsqrt_47': ['s72', 1], 'x_259': ['s72', 4096], 'x_260': ['s72', 4096], 'x_261': ['s72', 4096], 'output_parallel_95': ['s72', 14336], 'getitem_670': ['s72', 7168], 'silu_23': ['s72', 7168], 'getitem_671': ['s72', 7168], 'x_262': ['s72', 7168], 'output_parallel_96': ['s72', 4096], 'output_144': ['s72', 4096], '_get_data_attr_48': [4096], 'x_263': ['s72', 4096], 'x_264': ['s72', 4096], 'residual_47': ['s72', 4096], 
'pow_49': ['s72', 4096], 'variance_48': ['s72', 1], 'add_145': ['s72', 1], 'rsqrt_48': ['s72', 1], 'x_265': ['s72', 4096], 'x_266': ['s72', 4096], 'x_267': ['s72', 4096], 'output_parallel_97': ['s72', 3072], 'q_24': ['s72', 2048], 'k_24': ['s72', 512], 'v_24': ['s72', 512], 'positions_24': ['s72'], 'cos_sin_24': ['s72', 128], 'cos_72': ['s72', 64], 'sin_72': ['s72', 64], 'query_72': ['s72', 16, 128], 'query_rot_24': ['s72', 16, 128], 'query_pass_24': ['s72', 16, 0], 'unsqueeze_97': ['s72', 1, 64], 'cos_73': ['s72', 1, 64], 'unsqueeze_98': ['s72', 1, 64], 'sin_73': ['s72', 1, 64], 'x1_48': ['s72', 16, 64], 'x2_48': ['s72', 16, 64], 'mul_317': ['s72', 16, 64], 'mul_318': ['s72', 16, 64], 'o1_48': ['s72', 16, 64], 'mul_319': ['s72', 16, 64], 'mul_320': ['s72', 16, 64], 'o2_48': ['s72', 16, 64], 'output_145': ['s72', 16, 128], 'cat_97': ['s72', 16, 128], 'query_73': ['s72', 2048], 'key_72': ['s72', 4, 128], 'key_rot_24': ['s72', 4, 128], 'key_pass_24': ['s72', 4, 0], 'unsqueeze_99': ['s72', 1, 64], 'cos_74': ['s72', 1, 64], 'unsqueeze_100': ['s72', 1, 64], 'sin_74': ['s72', 1, 64], 'x1_49': ['s72', 4, 64], 'x2_49': ['s72', 4, 64], 'mul_321': ['s72', 4, 64], 'mul_322': ['s72', 4, 64], 'o1_49': ['s72', 4, 64], 'mul_323': ['s72', 4, 64], 'mul_324': ['s72', 4, 64], 'o2_49': ['s72', 4, 64], 'output_146': ['s72', 4, 128], 'cat_99': ['s72', 4, 128], 'key_73': ['s72', 512], 'output_147': ['s72', 2048], 'query_74': ['s72', 16, 128], 'output_148': ['s72', 16, 128], 'key_74': ['s72', 4, 128], 'value_24': ['s72', 4, 128], 'attn_output_24': ['s72', 2048], 'output_parallel_98': ['s72', 4096], 'output_149': ['s72', 4096], '_get_data_attr_49': [4096], 'x_268': ['s72', 4096], 'x_269': ['s72', 4096], 'residual_48': ['s72', 4096], 'pow_50': ['s72', 4096], 'variance_49': ['s72', 1], 'add_149': ['s72', 1], 'rsqrt_49': ['s72', 1], 'x_270': ['s72', 4096], 'x_271': ['s72', 4096], 'x_272': ['s72', 4096], 'output_parallel_99': ['s72', 14336], 'getitem_698': ['s72', 7168], 'silu_24': ['s72', 7168], 'getitem_699': ['s72', 7168], 'x_273': ['s72', 7168], 'output_parallel_100': ['s72', 4096], 'output_150': ['s72', 4096], '_get_data_attr_50': [4096], 'x_274': ['s72', 4096], 'x_275': ['s72', 4096], 'residual_49': ['s72', 4096], 'pow_51': ['s72', 4096], 'variance_50': ['s72', 1], 'add_151': ['s72', 1], 'rsqrt_50': ['s72', 1], 'x_276': ['s72', 4096], 'x_277': ['s72', 4096], 'x_278': ['s72', 4096], 'output_parallel_101': ['s72', 3072], 'q_25': ['s72', 2048], 'k_25': ['s72', 512], 'v_25': ['s72', 512], 'positions_25': ['s72'], 'cos_sin_25': ['s72', 128], 'cos_75': ['s72', 64], 'sin_75': ['s72', 64], 'query_75': ['s72', 16, 128], 'query_rot_25': ['s72', 16, 128], 'query_pass_25': ['s72', 16, 0], 'unsqueeze_101': ['s72', 1, 64], 'cos_76': ['s72', 1, 64], 'unsqueeze_102': ['s72', 1, 64], 'sin_76': ['s72', 1, 64], 'x1_50': ['s72', 16, 64], 'x2_50': ['s72', 16, 64], 'mul_330': ['s72', 16, 64], 'mul_331': ['s72', 16, 64], 'o1_50': ['s72', 16, 64], 'mul_332': ['s72', 16, 64], 'mul_333': ['s72', 16, 64], 'o2_50': ['s72', 16, 64], 'output_151': ['s72', 16, 128], 'cat_101': ['s72', 16, 128], 'query_76': ['s72', 2048], 'key_75': ['s72', 4, 128], 'key_rot_25': ['s72', 4, 128], 'key_pass_25': ['s72', 4, 0], 'unsqueeze_103': ['s72', 1, 64], 'cos_77': ['s72', 1, 64], 'unsqueeze_104': ['s72', 1, 64], 'sin_77': ['s72', 1, 64], 'x1_51': ['s72', 4, 64], 'x2_51': ['s72', 4, 64], 'mul_334': ['s72', 4, 64], 'mul_335': ['s72', 4, 64], 'o1_51': ['s72', 4, 64], 'mul_336': ['s72', 4, 64], 'mul_337': ['s72', 4, 64], 'o2_51': ['s72', 4, 64], 'output_152': 
['s72', 4, 128], 'cat_103': ['s72', 4, 128], 'key_76': ['s72', 512], 'output_153': ['s72', 2048], 'query_77': ['s72', 16, 128], 'output_154': ['s72', 16, 128], 'key_77': ['s72', 4, 128], 'value_25': ['s72', 4, 128], 'attn_output_25': ['s72', 2048], 'output_parallel_102': ['s72', 4096], 'output_155': ['s72', 4096], '_get_data_attr_51': [4096], 'x_279': ['s72', 4096], 'x_280': ['s72', 4096], 'residual_50': ['s72', 4096], 'pow_52': ['s72', 4096], 'variance_51': ['s72', 1], 'add_155': ['s72', 1], 'rsqrt_51': ['s72', 1], 'x_281': ['s72', 4096], 'x_282': ['s72', 4096], 'x_283': ['s72', 4096], 'output_parallel_103': ['s72', 14336], 'getitem_726': ['s72', 7168], 'silu_25': ['s72', 7168], 'getitem_727': ['s72', 7168], 'x_284': ['s72', 7168], 'output_parallel_104': ['s72', 4096], 'output_156': ['s72', 4096], '_get_data_attr_52': [4096], 'x_285': ['s72', 4096], 'x_286': ['s72', 4096], 'residual_51': ['s72', 4096], 'pow_53': ['s72', 4096], 'variance_52': ['s72', 1], 'add_157': ['s72', 1], 'rsqrt_52': ['s72', 1], 'x_287': ['s72', 4096], 'x_288': ['s72', 4096], 'x_289': ['s72', 4096], 'output_parallel_105': ['s72', 3072], 'q_26': ['s72', 2048], 'k_26': ['s72', 512], 'v_26': ['s72', 512], 'positions_26': ['s72'], 'cos_sin_26': ['s72', 128], 'cos_78': ['s72', 64], 'sin_78': ['s72', 64], 'query_78': ['s72', 16, 128], 'query_rot_26': ['s72', 16, 128], 'query_pass_26': ['s72', 16, 0], 'unsqueeze_105': ['s72', 1, 64], 'cos_79': ['s72', 1, 64], 'unsqueeze_106': ['s72', 1, 64], 'sin_79': ['s72', 1, 64], 'x1_52': ['s72', 16, 64], 'x2_52': ['s72', 16, 64], 'mul_343': ['s72', 16, 64], 'mul_344': ['s72', 16, 64], 'o1_52': ['s72', 16, 64], 'mul_345': ['s72', 16, 64], 'mul_346': ['s72', 16, 64], 'o2_52': ['s72', 16, 64], 'output_157': ['s72', 16, 128], 'cat_105': ['s72', 16, 128], 'query_79': ['s72', 2048], 'key_78': ['s72', 4, 128], 'key_rot_26': ['s72', 4, 128], 'key_pass_26': ['s72', 4, 0], 'unsqueeze_107': ['s72', 1, 64], 'cos_80': ['s72', 1, 64], 'unsqueeze_108': ['s72', 1, 64], 'sin_80': ['s72', 1, 64], 'x1_53': ['s72', 4, 64], 'x2_53': ['s72', 4, 64], 'mul_347': ['s72', 4, 64], 'mul_348': ['s72', 4, 64], 'o1_53': ['s72', 4, 64], 'mul_349': ['s72', 4, 64], 'mul_350': ['s72', 4, 64], 'o2_53': ['s72', 4, 64], 'output_158': ['s72', 4, 128], 'cat_107': ['s72', 4, 128], 'key_79': ['s72', 512], 'output_159': ['s72', 2048], 'query_80': ['s72', 16, 128], 'output_160': ['s72', 16, 128], 'key_80': ['s72', 4, 128], 'value_26': ['s72', 4, 128], 'attn_output_26': ['s72', 2048], 'output_parallel_106': ['s72', 4096], 'output_161': ['s72', 4096], '_get_data_attr_53': [4096], 'x_290': ['s72', 4096], 'x_291': ['s72', 4096], 'residual_52': ['s72', 4096], 'pow_54': ['s72', 4096], 'variance_53': ['s72', 1], 'add_161': ['s72', 1], 'rsqrt_53': ['s72', 1], 'x_292': ['s72', 4096], 'x_293': ['s72', 4096], 'x_294': ['s72', 4096], 'output_parallel_107': ['s72', 14336], 'getitem_754': ['s72', 7168], 'silu_26': ['s72', 7168], 'getitem_755': ['s72', 7168], 'x_295': ['s72', 7168], 'output_parallel_108': ['s72', 4096], 'output_162': ['s72', 4096], '_get_data_attr_54': [4096], 'x_296': ['s72', 4096], 'x_297': ['s72', 4096], 'residual_53': ['s72', 4096], 'pow_55': ['s72', 4096], 'variance_54': ['s72', 1], 'add_163': ['s72', 1], 'rsqrt_54': ['s72', 1], 'x_298': ['s72', 4096], 'x_299': ['s72', 4096], 'x_300': ['s72', 4096], 'output_parallel_109': ['s72', 3072], 'q_27': ['s72', 2048], 'k_27': ['s72', 512], 'v_27': ['s72', 512], 'positions_27': ['s72'], 'cos_sin_27': ['s72', 128], 'cos_81': ['s72', 64], 'sin_81': ['s72', 64], 'query_81': ['s72', 16, 
128], 'query_rot_27': ['s72', 16, 128], 'query_pass_27': ['s72', 16, 0], 'unsqueeze_109': ['s72', 1, 64], 'cos_82': ['s72', 1, 64], 'unsqueeze_110': ['s72', 1, 64], 'sin_82': ['s72', 1, 64], 'x1_54': ['s72', 16, 64], 'x2_54': ['s72', 16, 64], 'mul_356': ['s72', 16, 64], 'mul_357': ['s72', 16, 64], 'o1_54': ['s72', 16, 64], 'mul_358': ['s72', 16, 64], 'mul_359': ['s72', 16, 64], 'o2_54': ['s72', 16, 64], 'output_163': ['s72', 16, 128], 'cat_109': ['s72', 16, 128], 'query_82': ['s72', 2048], 'key_81': ['s72', 4, 128], 'key_rot_27': ['s72', 4, 128], 'key_pass_27': ['s72', 4, 0], 'unsqueeze_111': ['s72', 1, 64], 'cos_83': ['s72', 1, 64], 'unsqueeze_112': ['s72', 1, 64], 'sin_83': ['s72', 1, 64], 'x1_55': ['s72', 4, 64], 'x2_55': ['s72', 4, 64], 'mul_360': ['s72', 4, 64], 'mul_361': ['s72', 4, 64], 'o1_55': ['s72', 4, 64], 'mul_362': ['s72', 4, 64], 'mul_363': ['s72', 4, 64], 'o2_55': ['s72', 4, 64], 'output_164': ['s72', 4, 128], 'cat_111': ['s72', 4, 128], 'key_82': ['s72', 512], 'output_165': ['s72', 2048], 'query_83': ['s72', 16, 128], 'output_166': ['s72', 16, 128], 'key_83': ['s72', 4, 128], 'value_27': ['s72', 4, 128], 'attn_output_27': ['s72', 2048], 'output_parallel_110': ['s72', 4096], 'output_167': ['s72', 4096], '_get_data_attr_55': [4096], 'x_301': ['s72', 4096], 'x_302': ['s72', 4096], 'residual_54': ['s72', 4096], 'pow_56': ['s72', 4096], 'variance_55': ['s72', 1], 'add_167': ['s72', 1], 'rsqrt_55': ['s72', 1], 'x_303': ['s72', 4096], 'x_304': ['s72', 4096], 'x_305': ['s72', 4096], 'output_parallel_111': ['s72', 14336], 'getitem_782': ['s72', 7168], 'silu_27': ['s72', 7168], 'getitem_783': ['s72', 7168], 'x_306': ['s72', 7168], 'output_parallel_112': ['s72', 4096], 'output_168': ['s72', 4096], '_get_data_attr_56': [4096], 'x_307': ['s72', 4096], 'x_308': ['s72', 4096], 'residual_55': ['s72', 4096], 'pow_57': ['s72', 4096], 'variance_56': ['s72', 1], 'add_169': ['s72', 1], 'rsqrt_56': ['s72', 1], 'x_309': ['s72', 4096], 'x_310': ['s72', 4096], 'x_311': ['s72', 4096], 'output_parallel_113': ['s72', 3072], 'q_28': ['s72', 2048], 'k_28': ['s72', 512], 'v_28': ['s72', 512], 'positions_28': ['s72'], 'cos_sin_28': ['s72', 128], 'cos_84': ['s72', 64], 'sin_84': ['s72', 64], 'query_84': ['s72', 16, 128], 'query_rot_28': ['s72', 16, 128], 'query_pass_28': ['s72', 16, 0], 'unsqueeze_113': ['s72', 1, 64], 'cos_85': ['s72', 1, 64], 'unsqueeze_114': ['s72', 1, 64], 'sin_85': ['s72', 1, 64], 'x1_56': ['s72', 16, 64], 'x2_56': ['s72', 16, 64], 'mul_369': ['s72', 16, 64], 'mul_370': ['s72', 16, 64], 'o1_56': ['s72', 16, 64], 'mul_371': ['s72', 16, 64], 'mul_372': ['s72', 16, 64], 'o2_56': ['s72', 16, 64], 'output_169': ['s72', 16, 128], 'cat_113': ['s72', 16, 128], 'query_85': ['s72', 2048], 'key_84': ['s72', 4, 128], 'key_rot_28': ['s72', 4, 128], 'key_pass_28': ['s72', 4, 0], 'unsqueeze_115': ['s72', 1, 64], 'cos_86': ['s72', 1, 64], 'unsqueeze_116': ['s72', 1, 64], 'sin_86': ['s72', 1, 64], 'x1_57': ['s72', 4, 64], 'x2_57': ['s72', 4, 64], 'mul_373': ['s72', 4, 64], 'mul_374': ['s72', 4, 64], 'o1_57': ['s72', 4, 64], 'mul_375': ['s72', 4, 64], 'mul_376': ['s72', 4, 64], 'o2_57': ['s72', 4, 64], 'output_170': ['s72', 4, 128], 'cat_115': ['s72', 4, 128], 'key_85': ['s72', 512], 'output_171': ['s72', 2048], 'query_86': ['s72', 16, 128], 'output_172': ['s72', 16, 128], 'key_86': ['s72', 4, 128], 'value_28': ['s72', 4, 128], 'attn_output_28': ['s72', 2048], 'output_parallel_114': ['s72', 4096], 'output_173': ['s72', 4096], '_get_data_attr_57': [4096], 'x_312': ['s72', 4096], 'x_313': ['s72', 4096], 
'residual_56': ['s72', 4096], 'pow_58': ['s72', 4096], 'variance_57': ['s72', 1], 'add_173': ['s72', 1], 'rsqrt_57': ['s72', 1], 'x_314': ['s72', 4096], 'x_315': ['s72', 4096], 'x_316': ['s72', 4096], 'output_parallel_115': ['s72', 14336], 'getitem_810': ['s72', 7168], 'silu_28': ['s72', 7168], 'getitem_811': ['s72', 7168], 'x_317': ['s72', 7168], 'output_parallel_116': ['s72', 4096], 'output_174': ['s72', 4096], '_get_data_attr_58': [4096], 'x_318': ['s72', 4096], 'x_319': ['s72', 4096], 'residual_57': ['s72', 4096], 'pow_59': ['s72', 4096], 'variance_58': ['s72', 1], 'add_175': ['s72', 1], 'rsqrt_58': ['s72', 1], 'x_320': ['s72', 4096], 'x_321': ['s72', 4096], 'x_322': ['s72', 4096], 'output_parallel_117': ['s72', 3072], 'q_29': ['s72', 2048], 'k_29': ['s72', 512], 'v_29': ['s72', 512], 'positions_29': ['s72'], 'cos_sin_29': ['s72', 128], 'cos_87': ['s72', 64], 'sin_87': ['s72', 64], 'query_87': ['s72', 16, 128], 'query_rot_29': ['s72', 16, 128], 'query_pass_29': ['s72', 16, 0], 'unsqueeze_117': ['s72', 1, 64], 'cos_88': ['s72', 1, 64], 'unsqueeze_118': ['s72', 1, 64], 'sin_88': ['s72', 1, 64], 'x1_58': ['s72', 16, 64], 'x2_58': ['s72', 16, 64], 'mul_382': ['s72', 16, 64], 'mul_383': ['s72', 16, 64], 'o1_58': ['s72', 16, 64], 'mul_384': ['s72', 16, 64], 'mul_385': ['s72', 16, 64], 'o2_58': ['s72', 16, 64], 'output_175': ['s72', 16, 128], 'cat_117': ['s72', 16, 128], 'query_88': ['s72', 2048], 'key_87': ['s72', 4, 128], 'key_rot_29': ['s72', 4, 128], 'key_pass_29': ['s72', 4, 0], 'unsqueeze_119': ['s72', 1, 64], 'cos_89': ['s72', 1, 64], 'unsqueeze_120': ['s72', 1, 64], 'sin_89': ['s72', 1, 64], 'x1_59': ['s72', 4, 64], 'x2_59': ['s72', 4, 64], 'mul_386': ['s72', 4, 64], 'mul_387': ['s72', 4, 64], 'o1_59': ['s72', 4, 64], 'mul_388': ['s72', 4, 64], 'mul_389': ['s72', 4, 64], 'o2_59': ['s72', 4, 64], 'output_176': ['s72', 4, 128], 'cat_119': ['s72', 4, 128], 'key_88': ['s72', 512], 'output_177': ['s72', 2048], 'query_89': ['s72', 16, 128], 'output_178': ['s72', 16, 128], 'key_89': ['s72', 4, 128], 'value_29': ['s72', 4, 128], 'attn_output_29': ['s72', 2048], 'output_parallel_118': ['s72', 4096], 'output_179': ['s72', 4096], '_get_data_attr_59': [4096], 'x_323': ['s72', 4096], 'x_324': ['s72', 4096], 'residual_58': ['s72', 4096], 'pow_60': ['s72', 4096], 'variance_59': ['s72', 1], 'add_179': ['s72', 1], 'rsqrt_59': ['s72', 1], 'x_325': ['s72', 4096], 'x_326': ['s72', 4096], 'x_327': ['s72', 4096], 'output_parallel_119': ['s72', 14336], 'getitem_838': ['s72', 7168], 'silu_29': ['s72', 7168], 'getitem_839': ['s72', 7168], 'x_328': ['s72', 7168], 'output_parallel_120': ['s72', 4096], 'output_180': ['s72', 4096], '_get_data_attr_60': [4096], 'x_329': ['s72', 4096], 'x_330': ['s72', 4096], 'residual_59': ['s72', 4096], 'pow_61': ['s72', 4096], 'variance_60': ['s72', 1], 'add_181': ['s72', 1], 'rsqrt_60': ['s72', 1], 'x_331': ['s72', 4096], 'x_332': ['s72', 4096], 'x_333': ['s72', 4096], 'output_parallel_121': ['s72', 3072], 'q_30': ['s72', 2048], 'k_30': ['s72', 512], 'v_30': ['s72', 512], 'positions_30': ['s72'], 'cos_sin_30': ['s72', 128], 'cos_90': ['s72', 64], 'sin_90': ['s72', 64], 'query_90': ['s72', 16, 128], 'query_rot_30': ['s72', 16, 128], 'query_pass_30': ['s72', 16, 0], 'unsqueeze_121': ['s72', 1, 64], 'cos_91': ['s72', 1, 64], 'unsqueeze_122': ['s72', 1, 64], 'sin_91': ['s72', 1, 64], 'x1_60': ['s72', 16, 64], 'x2_60': ['s72', 16, 64], 'mul_395': ['s72', 16, 64], 'mul_396': ['s72', 16, 64], 'o1_60': ['s72', 16, 64], 'mul_397': ['s72', 16, 64], 'mul_398': ['s72', 16, 64], 'o2_60': 
['s72', 16, 64], 'output_181': ['s72', 16, 128], 'cat_121': ['s72', 16, 128], 'query_91': ['s72', 2048], 'key_90': ['s72', 4, 128], 'key_rot_30': ['s72', 4, 128], 'key_pass_30': ['s72', 4, 0], 'unsqueeze_123': ['s72', 1, 64], 'cos_92': ['s72', 1, 64], 'unsqueeze_124': ['s72', 1, 64], 'sin_92': ['s72', 1, 64], 'x1_61': ['s72', 4, 64], 'x2_61': ['s72', 4, 64], 'mul_399': ['s72', 4, 64], 'mul_400': ['s72', 4, 64], 'o1_61': ['s72', 4, 64], 'mul_401': ['s72', 4, 64], 'mul_402': ['s72', 4, 64], 'o2_61': ['s72', 4, 64], 'output_182': ['s72', 4, 128], 'cat_123': ['s72', 4, 128], 'key_91': ['s72', 512], 'output_183': ['s72', 2048], 'query_92': ['s72', 16, 128], 'output_184': ['s72', 16, 128], 'key_92': ['s72', 4, 128], 'value_30': ['s72', 4, 128], 'attn_output_30': ['s72', 2048], 'output_parallel_122': ['s72', 4096], 'output_185': ['s72', 4096], '_get_data_attr_61': [4096], 'x_334': ['s72', 4096], 'x_335': ['s72', 4096], 'residual_60': ['s72', 4096], 'pow_62': ['s72', 4096], 'variance_61': ['s72', 1], 'add_185': ['s72', 1], 'rsqrt_61': ['s72', 1], 'x_336': ['s72', 4096], 'x_337': ['s72', 4096], 'x_338': ['s72', 4096], 'output_parallel_123': ['s72', 14336], 'getitem_866': ['s72', 7168], 'silu_30': ['s72', 7168], 'getitem_867': ['s72', 7168], 'x_339': ['s72', 7168], 'output_parallel_124': ['s72', 4096], 'output_186': ['s72', 4096], '_get_data_attr_62': [4096], 'x_340': ['s72', 4096], 'x_341': ['s72', 4096], 'residual_61': ['s72', 4096], 'pow_63': ['s72', 4096], 'variance_62': ['s72', 1], 'add_187': ['s72', 1], 'rsqrt_62': ['s72', 1], 'x_342': ['s72', 4096], 'x_343': ['s72', 4096], 'x_344': ['s72', 4096], 'output_parallel_125': ['s72', 3072], 'q_31': ['s72', 2048], 'k_31': ['s72', 512], 'v_31': ['s72', 512], 'positions_31': ['s72'], 'cos_sin_31': ['s72', 128], 'cos_93': ['s72', 64], 'sin_93': ['s72', 64], 'query_93': ['s72', 16, 128], 'query_rot_31': ['s72', 16, 128], 'query_pass_31': ['s72', 16, 0], 'unsqueeze_125': ['s72', 1, 64], 'cos_94': ['s72', 1, 64], 'unsqueeze_126': ['s72', 1, 64], 'sin_94': ['s72', 1, 64], 'x1_62': ['s72', 16, 64], 'x2_62': ['s72', 16, 64], 'mul_408': ['s72', 16, 64], 'mul_409': ['s72', 16, 64], 'o1_62': ['s72', 16, 64], 'mul_410': ['s72', 16, 64], 'mul_411': ['s72', 16, 64], 'o2_62': ['s72', 16, 64], 'output_187': ['s72', 16, 128], 'cat_125': ['s72', 16, 128], 'query_94': ['s72', 2048], 'key_93': ['s72', 4, 128], 'key_rot_31': ['s72', 4, 128], 'key_pass_31': ['s72', 4, 0], 'unsqueeze_127': ['s72', 1, 64], 'cos_95': ['s72', 1, 64], 'unsqueeze_128': ['s72', 1, 64], 'sin_95': ['s72', 1, 64], 'x1_63': ['s72', 4, 64], 'x2_63': ['s72', 4, 64], 'mul_412': ['s72', 4, 64], 'mul_413': ['s72', 4, 64], 'o1_63': ['s72', 4, 64], 'mul_414': ['s72', 4, 64], 'mul_415': ['s72', 4, 64], 'o2_63': ['s72', 4, 64], 'output_188': ['s72', 4, 128], 'cat_127': ['s72', 4, 128], 'key_94': ['s72', 512], 'output_189': ['s72', 2048], 'query_95': ['s72', 16, 128], 'output_190': ['s72', 16, 128], 'key_95': ['s72', 4, 128], 'value_31': ['s72', 4, 128], 'attn_output_31': ['s72', 2048], 'output_parallel_126': ['s72', 4096], 'output_191': ['s72', 4096], '_get_data_attr_63': [4096], 'x_345': ['s72', 4096], 'x_346': ['s72', 4096], 'residual_62': ['s72', 4096], 'pow_64': ['s72', 4096], 'variance_63': ['s72', 1], 'add_191': ['s72', 1], 'rsqrt_63': ['s72', 1], 'x_347': ['s72', 4096], 'x_348': ['s72', 4096], 'x_349': ['s72', 4096], 'output_parallel_127': ['s72', 14336], 'getitem_894': ['s72', 7168], 'silu_31': ['s72', 7168], 'getitem_895': ['s72', 7168], 'x_350': ['s72', 7168], 'output_parallel_128': ['s72', 4096], 
'output_192': ['s72', 4096], '_get_data_attr_64': [4096], 'x_351': ['s72', 4096], 'x_352': ['s72', 4096], 'residual_63': ['s72', 4096], 'pow_65': ['s72', 4096], 'variance_64': ['s72', 1], 'add_193': ['s72', 1], 'rsqrt_64': ['s72', 1], 'x_353': ['s72', 4096], 'x_354': ['s72', 4096], 'x_355': ['s72', 4096]}", "has_guarded_code": true, "remote_cache_time_saved_s": null, "structured_logging_overhead_s": 0.43416, "config_suppress_errors": false, "config_inline_inbuilt_nn_modules": true, "specialize_float": false, "dynamo_config": "{\"_autograd_backward_strict_mode_conditional_banned_ops\": [\"stride\", \"storage_offset\", \"is_contiguous\"], \"_unsafe_skip_fsdp_module_guards\": false, \"accumulated_recompile_limit\": 256, \"allow_empty_graphs\": false, \"allow_ignore_mark_dynamic\": false, \"allow_rnn\": false, \"allow_unspec_int_on_nn_module\": false, \"allowed_functions_module_string_ignorelist\": [\"torch._decomp\", \"torch._prims\", \"torch._refs\", \"torch.distributions\", \"torch.testing\"], \"assume_dunder_attributes_remain_unchanged\": true, \"assume_static_by_default\": true, \"automatic_dynamic_local_pgo\": true, \"automatic_dynamic_remote_pgo\": null, \"automatic_dynamic_shapes\": false, \"automatic_dynamic_shapes_mark_as\": \"dynamic\", \"caching_precompile\": false, \"capture_autograd_function\": true, \"capture_dynamic_output_shape_ops\": false, \"capture_func_transforms\": true, \"capture_scalar_outputs\": false, \"capture_sparse_compute\": true, \"compiled_autograd\": false, \"compiled_autograd_kwargs_override\": {}, \"constant_fold_autograd_profiler_enabled\": false, \"cprofile\": false, \"cudagraph_backend_keep_input_mutation\": false, \"cudagraph_backend_support_input_mutation\": false, \"dead_code_elimination\": true, \"debug_disable_compile_counter\": false, \"debug_force_graph_break_on_leaf_return\": false, \"debug_force_nested_calls\": false, \"disable\": false, \"do_not_emit_runtime_asserts\": false, \"dont_skip_tracing\": false, \"dynamic_shapes\": true, \"enable_aot_compile\": false, \"enable_compiler_collectives\": false, \"enable_cpp_framelocals_guard_eval\": true, \"enable_cpp_guard_manager\": true, \"enable_cpp_symbolic_shape_guards\": false, \"enable_faithful_generator_behavior\": true, \"enable_trace_contextlib\": true, \"enable_trace_unittest\": false, \"enrich_profiler_metadata\": false, \"error_on_nested_fx_trace\": true, \"error_on_nested_jit_trace\": true, \"error_on_recompile\": false, \"fail_on_recompile_limit_hit\": false, \"fake_tensor_cache_crosscheck_enabled\": false, \"fake_tensor_cache_enabled\": true, \"fake_tensor_disable_inference_mode\": true, \"force_nn_module_property_static_shapes\": true, \"force_parameter_static_shapes\": true, \"force_unspec_int_unbacked_size_like_on_torchrec_kjt\": false, \"graph_break_on_nn_param_ctor\": true, \"graph_deduplication_lint\": false, \"guard_nn_modules\": true, \"guard_nn_modules_using_dict_tags\": true, \"inline_inbuilt_nn_modules\": true, \"install_free_tensors\": false, \"install_free_tensors_for_export\": true, \"issue_3_13_0_warning\": true, \"log_graph_in_out_metadata\": false, \"max_saved_pointers_for_recursive_dict_tags_check\": 256, \"minimum_call_count\": 1, \"nested_graph_breaks\": false, \"numpy_default_complex\": \"complex128\", \"numpy_default_float\": \"float64\", \"numpy_default_int\": \"int64\", \"only_allow_pt2_compliant_ops\": false, \"optimize_ddp\": true, \"optimize_ddp_lazy_compile\": false, \"prefer_deferred_runtime_asserts_over_guards\": false, \"prepare_freezing\": false, 
\"pt2_compile_id_prefix\": null, \"raise_on_ctx_manager_usage\": true, \"raise_on_unsafe_aot_autograd\": false, \"recompile_limit\": 16, \"record_compile_time_instruction_count\": false, \"record_runtime_overhead\": true, \"replay_record_enabled\": false, \"replay_side_effects\": true, \"report_guard_failures\": true, \"rewrite_assert_with_torch_assert\": true, \"run_gc_after_compile\": true, \"side_effect_replay_policy\": \"silent\", \"skip_code_recursive_on_recompile_limit_hit\": true, \"skip_fsdp_guards\": true, \"skip_fsdp_hooks\": true, \"skip_fwd_side_effects_in_bwd_under_checkpoint\": false, \"skip_guards_on_constant_func_defaults\": true, \"skip_nnmodule_hook_guards\": true, \"skip_no_tensor_aliasing_guards_on_parameters\": true, \"skip_tensor_guards_with_matching_dict_tags\": true, \"skip_torchrec\": true, \"skipfiles_inline_module_allowlist\": {}, \"specialize_float\": false, \"specialize_int\": false, \"strict_precompile\": false, \"suppress_errors\": false, \"trace_numpy\": true, \"track_nodes_for_deduplication\": false, \"use_graph_deduplication\": false, \"use_lamba_guard_for_object_aliasing\": true, \"use_lazy_graph_module\": true, \"use_numpy_random_stream\": false, \"use_recursive_dict_tags_for_guards\": true, \"verify_correctness\": false, \"wrap_top_frame\": false}", "compiler_config": "{\"cache_key_tag\": \"\", \"dynamic_sources\": \"\", \"force_cudagraph_gc\": false, \"force_disable_caches\": false, \"job_id\": null, \"pgo_extra_read_key\": null, \"pgo_extra_write_key\": null, \"unbacked_sources\": \"\"}", "is_forward": true, "num_triton_bundles": null, "remote_fx_graph_cache_get_time_ms": null, "remote_fx_graph_cache_put_time_ms": null, "start_time_us": 1769563061181114, "duration_us": 6111051, "dynamo_cumulative_compile_time_us": 6111051, "aot_autograd_cumulative_compile_time_us": 1818307, "inductor_cumulative_compile_time_us": null, "inductor_code_gen_cumulative_compile_time_us": null, "triton_compile_time_us": null, "runtime_cudagraphify_time_us": null, "runtime_triton_autotune_time_us": null, "dynamo_compile_time_before_restart_us": 0, "distributed_ephemeral_timeout_us": null, "structured_logging_overhead_us": 434160, "remote_fx_graph_cache_get_time_us": null, "remote_fx_graph_cache_put_time_us": null, "backward_cumulative_compile_time_us": null, "end_time_us": 1769563067292696, "pre_grad_pass_time_us": null, "post_grad_pass_time_us": null, "joint_graph_pass_time_us": null, "log_format_version": 3, "inductor_config": "{\"TYPE_CHECKING\": false, \"_cache_config_ignore_prefix\": [\"trace\", \"cuda.cutlass_dir\", \"worker_start_method\", \"compile_threads\", \"post_grad_custom_post_pass\", \"post_grad_custom_pre_pass\", \"joint_custom_pre_pass\", \"joint_custom_post_pass\", \"_fuse_ddp_communication_passes\", \"_pre_fusion_custom_pass\", \"always_complex_memory_overlap_TESTING_ONLY\", \"fx_graph_cache\", \"fx_graph_remote_cache\", \"autotune_local_cache\", \"autotune_remote_cache\"], \"_collective.auto_select\": false, \"_collective.one_shot_all_reduce_threshold_bytes\": 131072, \"_debug_cpu_to_tpu_pallas\": false, \"_fuse_ddp_bucket_size\": 25, \"_fuse_ddp_communication\": false, \"_fuse_ddp_communication_passes\": [\"fuse_ddp_with_concat_op\", \"schedule_comm_wait\"], \"_micro_pipeline_tp\": false, \"_post_fusion_custom_pass\": null, \"_pre_fusion_custom_pass\": null, \"_profile_var\": \"\", \"_raise_error_for_testing\": false, \"_save_config_ignore\": [\"trace.upload_tar\", \"joint_custom_pre_pass\", \"joint_custom_post_pass\", \"pre_grad_custom_pass\", 
\"aot_inductor.repro_level\", \"aot_inductor.dump_aoti_minifier\", \"post_grad_custom_pre_pass\", \"post_grad_custom_post_pass\", \"_fuse_ddp_communication_passes\", \"_pre_fusion_custom_pass\"], \"add_pre_grad_passes\": null, \"aggressive_fusion\": false, \"alignment_asserts\": true, \"allow_buffer_reuse\": true, \"always_complex_memory_overlap_TESTING_ONLY\": false, \"always_keep_tensor_constants\": false, \"annotate_training\": false, \"aot_inductor.allow_stack_allocation\": false, \"aot_inductor.aoti_shim_library\": null, \"aot_inductor.aoti_shim_library_path\": null, \"aot_inductor.check_lowerbound\": true, \"aot_inductor.compile_wrapper_opt_level\": \"O1\", \"aot_inductor.cross_target_platform\": null, \"aot_inductor.custom_op_libs\": null, \"aot_inductor.custom_ops_to_c_shims\": {}, \"aot_inductor.debug_compile\": false, \"aot_inductor.debug_intermediate_value_printer\": \"0\", \"aot_inductor.debug_symbols\": false, \"aot_inductor.dump_aoti_minifier\": false, \"aot_inductor.dynamic_linkage\": true, \"aot_inductor.embed_kernel_binary\": null, \"aot_inductor.emit_multi_arch_kernel\": null, \"aot_inductor.enable_lto\": false, \"aot_inductor.filtered_kernel_names\": null, \"aot_inductor.force_mmap_weights\": false, \"aot_inductor.link_libtorch\": true, \"aot_inductor.metadata\": {}, \"aot_inductor.model_name_for_generated_files\": null, \"aot_inductor.output_path\": \"\", \"aot_inductor.package\": false, \"aot_inductor.package_constants_in_so\": true, \"aot_inductor.package_constants_on_disk_format\": null, \"aot_inductor.package_cpp_only\": null, \"aot_inductor.precompile_headers\": true, \"aot_inductor.presets\": {}, \"aot_inductor.raise_error_on_ignored_optimization\": true, \"aot_inductor.repro_level\": 2, \"aot_inductor.serialized_in_spec\": \"\", \"aot_inductor.serialized_out_spec\": \"\", \"aot_inductor.use_consts_asm_build\": true, \"aot_inductor.use_minimal_arrayref_interface\": false, \"aot_inductor.use_runtime_constant_folding\": false, \"aot_inductor.weight_use_caching_allocator\": false, \"aot_inductor_mode.compile_standalone\": false, \"assert_indirect_indexing\": true, \"assume_32bit_indexing\": true, \"assume_aligned_inputs\": false, \"assume_unaligned_fallback_output\": false, \"aten_distributed_optimizations.collective_bucketing\": null, \"aten_distributed_optimizations.collective_estimator\": \"analytical\", \"aten_distributed_optimizations.compute_overlap_multipler\": null, \"aten_distributed_optimizations.custom_runtime_estimation\": null, \"aten_distributed_optimizations.enable_overlap_scheduling\": false, \"aten_distributed_optimizations.insert_overlap_deps\": null, \"aten_distributed_optimizations.max_coll_distance\": null, \"aten_distributed_optimizations.max_compute_pre_fetch\": null, \"aten_distributed_optimizations.max_in_flight_gb\": null, \"aten_distributed_optimizations.max_memory_increase_gb\": null, \"aten_distributed_optimizations.max_memory_increase_ratio\": null, \"autoheuristic_collect\": \"\", \"autoheuristic_log_path\": \"DEFAULT\", \"autoheuristic_use\": \"mixed_mm\", \"autotune_fallback_to_aten\": false, \"autotune_in_subproc\": false, \"autotune_local_cache\": true, \"autotune_lookup_table\": {}, \"autotune_multi_device\": false, \"autotune_num_choices_displayed\": 10, \"autotune_remote_cache\": null, \"b2b_gemm_pass\": false, \"batch_fusion\": true, \"benchmark_combo_kernel\": false, \"benchmark_epilogue_fusion\": true, \"benchmark_fusion\": false, \"benchmark_harness\": true, \"benchmark_kernel\": false, \"bucket_all_gathers_fx\": \"none\", 
\"bucket_all_gathers_fx_bucket_size_determinator\": null, \"bucket_all_reduces_fx\": \"none\", \"bucket_all_reduces_fx_bucket_size_determinator\": null, \"bucket_reduce_scatters_fx\": \"none\", \"bucket_reduce_scatters_fx_bucket_size_determinator\": null, \"bundle_triton_into_fx_graph_cache\": true, \"bundled_autotune_remote_cache\": null, \"bw_outputs_user_visible\": true, \"can_inplace_pad_graph_input\": false, \"check_stack_no_cycles_TESTING_ONLY\": false, \"collective_benchmark_nruns\": 50, \"collective_benchmark_timeout\": 30.0, \"combo_kernel_allow_mixed_sizes\": 1, \"combo_kernel_foreach_dynamic_shapes\": true, \"combo_kernel_max_num_args\": 250, \"combo_kernels\": false, \"combo_kernels_autotune\": 1, \"comment_origin\": false, \"compile_threads\": 1, \"comprehensive_padding\": true, \"compute_all_bounds\": false, \"constant_and_index_propagation\": true, \"conv_1x1_as_mm\": false, \"coordinate_descent_check_all_directions\": false, \"coordinate_descent_search_radius\": 1, \"coordinate_descent_tuning\": false, \"cpp.cxx\": [null, \"g++\"], \"cpp.descriptive_names\": \"original_aten\", \"cpp.dynamic_threads\": false, \"cpp.enable_concat_linear\": false, \"cpp.enable_floating_point_contract_flag\": \"off\", \"cpp.enable_grouped_gemm_template\": false, \"cpp.enable_kernel_profile\": false, \"cpp.enable_loop_tail_vec\": true, \"cpp.enable_tiling_heuristics\": true, \"cpp.enable_unsafe_math_opt_flag\": false, \"cpp.fallback_scatter_reduce_sum\": true, \"cpp.force_inline_kernel\": false, \"cpp.gemm_cache_blocking\": null, \"cpp.gemm_max_k_slices\": 1, \"cpp.gemm_thread_factors\": null, \"cpp.inject_log1p_bug_TESTING_ONLY\": null, \"cpp.inject_relu_bug_TESTING_ONLY\": null, \"cpp.max_horizontal_fusion_size\": 16, \"cpp.min_chunk_size\": 512, \"cpp.no_redundant_loops\": true, \"cpp.simdlen\": null, \"cpp.threads\": -1, \"cpp.use_constexpr_for_int_array\": true, \"cpp.use_decompose_tanh\": false, \"cpp.use_small_dequant_buffer\": false, \"cpp.vec_isa_ok\": null, \"cpp.weight_prepack\": true, \"cpp_cache_precompile_headers\": true, \"cpp_wrapper\": false, \"cpp_wrapper_build_separate\": false, \"cpu_backend\": \"cpp\", \"cpu_gpu_bw\": 50.0, \"cuda.arch\": null, \"cuda.binary_remote_cache_force_write\": false, \"cuda.compile_opt_level\": \"-O1\", \"cuda.cuda_cxx\": null, \"cuda.cutlass_backend_min_gemm_size\": 1, \"cuda.cutlass_dir\": \"/home/angelayi/.conda/envs/vllm-2.10/lib/python3.12/site-packages/third_party/cutlass\", \"cuda.cutlass_enabled_ops\": \"all\", \"cuda.cutlass_epilogue_fusion_enabled\": false, \"cuda.cutlass_hash_with_compile_cmd\": false, \"cuda.cutlass_instantiation_level\": \"0\", \"cuda.cutlass_max_profiling_configs\": null, \"cuda.cutlass_max_profiling_swizzle_options\": [1, 2, 4, 8], \"cuda.cutlass_op_allowlist_regex\": null, \"cuda.cutlass_op_denylist_regex\": null, \"cuda.cutlass_prescreening\": true, \"cuda.cutlass_tma_only\": false, \"cuda.enable_caching_codegen\": true, \"cuda.enable_cuda_lto\": false, \"cuda.enable_debug_info\": false, \"cuda.enable_ptxas_info\": false, \"cuda.generate_test_runner\": false, \"cuda.upload_to_binary_remote_cache\": false, \"cuda.use_binary_remote_cache\": true, \"cuda.use_fast_math\": false, \"cuda.version\": null, \"cuda_backend\": \"triton\", \"custom_partitioner_fn\": null, \"custom_should_partition_ops\": [], \"cutedsl_enable_autotuning\": false, \"dce\": false, \"debug\": false, \"debug_fusion\": false, \"debug_index_asserts\": false, \"debug_ir_traceback\": false, \"decompose_mem_bound_mm\": false, \"deterministic\": false, 
\"developer_warnings\": false, \"disable_cpp_codegen\": false, \"disable_padding_cpu\": true, \"disable_progress\": true, \"distributed_max_autotune_gemm\": false, \"dynamic_scale_rblock\": true, \"efficient_conv_bn_eval_fx_passes\": false, \"emulate_divison_rounding\": false, \"emulate_precision_casts\": false, \"enable_auto_functionalized_v2\": true, \"enable_autograd_for_aot\": false, \"enable_caching_generated_triton_templates\": true, \"enable_linear_binary_folding\": false, \"enabled_metric_tables\": \"\", \"epilogue_fusion\": true, \"epilogue_fusion_first\": false, \"estimate_op_runtime\": \"default\", \"expand_dimension_for_pointwise_nodes\": false, \"external_matmul\": [], \"fallback_by_default\": false, \"fallback_embedding_bag_byte_unpack\": false, \"fallback_random\": false, \"file_lock_timeout\": 600, \"force_fuse_int_mm_with_mul\": false, \"force_layout_optimization\": false, \"force_pointwise_cat\": false, \"force_same_precision\": false, \"force_shape_pad\": false, \"freezing\": false, \"freezing_discard_parameters\": false, \"fx_graph_cache\": true, \"fx_graph_remote_cache\": null, \"fx_passes_numeric_check\": {\"num_iterations\": 1, \"pre_grad\": false, \"precision\": 0.0001, \"requires_optimizer\": true}, \"fx_wrapper\": false, \"generate_intermediate_hooks\": false, \"global_cache_dir\": null, \"graph_partition\": true, \"group_fusion\": false, \"halide.asserts\": false, \"halide.cpu_target\": \"host\", \"halide.debug\": false, \"halide.gpu_target\": \"host-cuda\", \"halide.scan_kernels\": false, \"halide.scheduler_cpu\": \"Adams2019\", \"halide.scheduler_cuda\": \"Anderson2021\", \"implicit_fallbacks\": true, \"inductor_choices_class\": null, \"inplace_buffers\": true, \"inplace_padding\": true, \"inter_node_bw\": 25, \"intra_node_bw\": 300, \"is_nightly_or_source\": false, \"is_predispatch\": false, \"joint_custom_post_pass\": null, \"joint_custom_pre_pass\": null, \"joint_graph_constant_folding\": true, \"keep_output_stride\": true, \"kernel_name_max_ops\": 10, \"layout_opt_default\": \"1\", \"layout_optimization\": true, \"log_tlparse\": false, \"lookup_table.check_src_hash\": true, \"lookup_table.table\": null, \"loop_index_inversion_in_fusion\": true, \"loop_ordering_after_fusion\": true, \"max_autotune\": false, \"max_autotune_allow_flexible_layouts\": false, \"max_autotune_conv_backends\": \"ATEN,TRITON\", \"max_autotune_flex_search_space\": \"DEFAULT\", \"max_autotune_gemm\": false, \"max_autotune_gemm_backends\": \"ATEN,TRITON,CPP\", \"max_autotune_gemm_search_space\": \"DEFAULT\", \"max_autotune_pointwise\": false, \"max_autotune_prune_choices_based_on_shared_mem\": true, \"max_autotune_report_choices_stats\": true, \"max_autotune_subproc_graceful_timeout_seconds\": 0.0, \"max_autotune_subproc_result_timeout_seconds\": 60.0, \"max_autotune_subproc_terminate_timeout_seconds\": 0.0, \"max_epilogue_benchmarked_choices\": 1, \"max_fusion_buffer_group_pairwise_attempts\": 64, \"max_fusion_size\": 64, \"max_fusion_unique_io_buffers\": null, \"max_pointwise_cat_inputs\": 8, \"memory_planning\": false, \"memory_pool\": \"intermediates\", \"min_num_split\": 0, \"mixed_mm_choice\": \"heuristic\", \"multi_kernel_hints\": [], \"nan_asserts\": false, \"non_blocking_remote_cache_write\": true, \"online_softmax\": true, \"optimize_scatter_upon_const_tensor\": true, \"pad_channels_last\": false, \"pad_dynamic_shapes\": false, \"pad_outputs\": false, \"padding_alignment_bytes\": 128, \"padding_stride_threshold\": 1024, \"pallas_take_first_jax_device_only\": true, 
\"pattern_matcher\": true, \"permute_fusion\": false, \"pick_loop_orders\": true, \"post_grad_custom_post_pass\": null, \"post_grad_custom_pre_pass\": null, \"post_grad_fusion_options\": {}, \"pre_grad_custom_pass\": null, \"pre_grad_fusion_options\": {}, \"precompilation_timeout_seconds\": 3600, \"profile_bandwidth\": false, \"profile_bandwidth_output\": null, \"profile_bandwidth_regex\": \"\", \"profile_bandwidth_with_do_bench_using_profiling\": false, \"profiler_mark_wrapper_call\": false, \"prologue_fusion\": true, \"quiesce_async_compile_pool\": true, \"quiesce_async_compile_time\": 60, \"realize_acc_reads_size_threshold\": null, \"realize_acc_reads_threshold\": 8, \"realize_opcount_threshold\": 30, \"realize_reads_threshold\": 4, \"remote_gemm_autotune_cache\": false, \"remove_pre_grad_passes\": null, \"reorder_for_compute_comm_overlap\": false, \"reorder_for_compute_comm_overlap_passes\": [], \"reorder_for_locality\": true, \"reorder_for_peak_memory\": true, \"reorder_for_peak_memory_debug\": false, \"reorder_prefetch_limit\": null, \"rocm.arch\": [], \"rocm.ck_dir\": null, \"rocm.ck_max_profiling_configs\": null, \"rocm.ck_supported_arch\": [\"gfx90a\", \"gfx942\", \"gfx950\"], \"rocm.ck_tile_max_profiling_configs\": null, \"rocm.compile_opt_level\": \"-O2\", \"rocm.contiguous_threshold\": 16, \"rocm.flush_denormals\": true, \"rocm.generate_test_runner\": false, \"rocm.is_debug\": false, \"rocm.kBatch_sweep\": null, \"rocm.n_max_profiling_configs\": null, \"rocm.print_kernel_resource_usage\": false, \"rocm.rocm_home\": null, \"rocm.save_temps\": false, \"rocm.split_k_threshold\": 16, \"rocm.use_fast_math\": true, \"rocm.use_preselected_instances\": false, \"run_jit_post_compile_hook\": false, \"runtime_estimations_mms_benchmark\": false, \"runtime_triton_nan_asserts\": false, \"save_args\": false, \"scalar_asserts\": true, \"score_fusion_memory_threshold\": 10, \"search_autotune_cache\": false, \"selective_decompose\": false, \"shape_padding\": true, \"size_asserts\": true, \"size_threshold_for_succ_based_strategy\": 0, \"sleep_sec_TESTING_ONLY\": null, \"small_memory_access_threshold\": 16777216, \"split_cat_fx_passes\": true, \"split_reductions\": true, \"static_launch_user_defined_triton_kernels\": false, \"static_weight_shapes\": true, \"strict_static_cuda_launcher\": false, \"test_configs.assume_bucketing_reduces_latency\": true, \"test_configs.autotune_choice_desc_regex\": null, \"test_configs.autotune_choice_name_regex\": null, \"test_configs.bisect_keep_custom_backend_for_inductor\": false, \"test_configs.bisect_pre_grad_graph\": false, \"test_configs.distort_benchmarking_result\": \"\", \"test_configs.force_extern_kernel_in_multi_template\": false, \"test_configs.force_filter_reduction_configs\": false, \"test_configs.graphsafe_rng_func_ignores_fallback_random\": false, \"test_configs.max_mm_configs\": null, \"test_configs.runtime_triton_dtype_assert\": false, \"test_configs.runtime_triton_shape_assert\": false, \"test_configs.static_cpp_dtype_assert\": false, \"test_configs.track_memory_lifecycle\": null, \"test_configs.use_libtorch\": false, \"torchinductor_worker_logpath\": \"\", \"trace.compile_profile\": false, \"trace.debug_dir\": null, \"trace.debug_log\": false, \"trace.dot_graph_shape\": null, \"trace.draw_orig_fx_graph\": false, \"trace.enabled\": false, \"trace.fx_graph\": true, \"trace.fx_graph_transformed\": true, \"trace.graph_diagram\": false, \"trace.info_log\": false, \"trace.ir_post_fusion\": true, \"trace.ir_pre_fusion\": true, 
\"trace.log_autotuning_results\": false, \"trace.log_url_for_graph_xform\": null, \"trace.output_code\": true, \"trace.provenance_tracking_level\": 0, \"trace.save_real_tensors\": false, \"trace.upload_tar\": null, \"triton.autotune_at_compile_time\": null, \"triton.autotune_cublasLt\": true, \"triton.autotune_pointwise\": true, \"triton.autotune_with_sample_inputs\": false, \"triton.coalesce_tiling_analysis\": true, \"triton.codegen_upcast_to_fp32\": true, \"triton.cooperative_reductions\": false, \"triton.cudagraph_capture_sizes\": null, \"triton.cudagraph_dynamic_shape_warn_limit\": 8, \"triton.cudagraph_or_error\": false, \"triton.cudagraph_skip_dynamic_graphs\": false, \"triton.cudagraph_support_input_mutation\": true, \"triton.cudagraph_trees\": true, \"triton.cudagraph_trees_history_recording\": false, \"triton.cudagraph_unexpected_rerecord_limit\": 128, \"triton.cudagraphs\": false, \"triton.debug_sync_graph\": false, \"triton.debug_sync_kernel\": false, \"triton.decompose_k_threshold\": 32, \"triton.dense_indexing\": false, \"triton.descriptive_names\": \"original_aten\", \"triton.disallow_failing_autotune_kernels_TESTING_ONLY\": false, \"triton.divisible_by_16\": true, \"triton.enable_epilogue_subtiling\": true, \"triton.enable_pdl\": false, \"triton.enable_persistent_tma_matmul\": false, \"triton.enable_template_tma_store\": false, \"triton.fast_path_cudagraph_asserts\": false, \"triton.force_cooperative_reductions\": false, \"triton.force_cudagraph_sync\": false, \"triton.force_cudagraphs_warmup\": false, \"triton.inject_relu_bug_TESTING_ONLY\": null, \"triton.max_tiles\": null, \"triton.min_split_scan_rblock\": 256, \"triton.mix_order_reduction\": true, \"triton.mix_order_reduction_autotune_split_size\": false, \"triton.mix_order_reduction_initial_xblock\": 1, \"triton.mix_order_reduction_split_size\": null, \"triton.multi_kernel\": 0, \"triton.native_matmul\": false, \"triton.num_decompose_k_splits\": 10, \"triton.persistent_reductions\": true, \"triton.prefer_nd_tiling\": false, \"triton.reorder_for_reducing_graph_partitions\": true, \"triton.skip_cudagraph_warmup\": false, \"triton.skip_l1_cache\": false, \"triton.slow_path_cudagraph_asserts\": true, \"triton.spill_threshold\": 16, \"triton.store_cubin\": false, \"triton.tile_reductions\": false, \"triton.tiling_prevents_pointwise_fusion\": true, \"triton.tiling_prevents_reduction_fusion\": true, \"triton.transpose_discontiguous_tensor_descriptor\": true, \"triton.unique_kernel_names\": true, \"triton.unique_user_kernel_names\": false, \"triton.use_block_ptr\": false, \"triton.use_tensor_descriptor\": false, \"triton_disable_device_detection\": false, \"triton_kernel_default_layout_constraint\": \"needs_fixed_stride_order\", \"unbacked_symint_fallback\": 8192, \"unroll_reductions_threshold\": 8, \"unsafe_ignore_unsupported_triton_autotune_args\": false, \"unsafe_marked_cacheable_functions\": {}, \"unsafe_skip_cache_dynamic_shape_guards\": false, \"use_dce\": true, \"use_experimental_benchmarker\": true, \"use_fast_math\": false, \"use_joint_graph_passes\": true, \"use_mixed_mm\": true, \"use_post_grad_passes\": true, \"use_pre_grad_passes\": true, \"use_static_cuda_launcher\": true, \"verbose_progress\": false, \"warn_mix_layout\": false, \"worker_log_path\": null, \"worker_start_method\": \"subprocess\", \"worker_suppress_logging\": true, \"wrap_inductor_compiled_regions\": false, \"write_are_deterministic_algorithms_enabled\": true, \"xpu_backend\": \"triton\"}", "remote_cache_version": null, 
"inductor_fx_remote_cache_hit_count": null, "inductor_fx_remote_cache_miss_count": null, "inductor_fx_remote_cache_backend_type": null, "inductor_fx_remote_cache_hit_keys": null, "inductor_fx_remote_cache_miss_keys": null, "cuda_version": "12.8", "triton_version": "3.6.0", "feature_usage": {"dynamo.automatic_dynamic_shapes": false}, "compile_time_autotune_time_us": null, "is_runtime": false, "gc_time_us": 430, "tensorify_float_attempt": null, "tensorify_float_success": null, "tensorify_float_failure": null, "guard_latency_us": 6, "recompile_reason": null, "num_graph_breaks": null, "triton_kernel_compile_times_us": null, "ir_count": 25245, "cudagraph_skip_reason": null, "python_version": "3.12.0 | packaged by Anaconda, Inc. | (main, Oct 2 2023, 17:29:18) [GCC 11.2.0]", "pgo_put_remote_code_state_time_us": null, "pgo_get_remote_code_state_time_us": null, "param_numel": 3752595456, "param_bytes": 7505190912, "param_count": 194, "recompile_user_contexts": null, "inline_inbuilt_nn_modules_candidate": false, "pytorch_version": "2.10.0+cu128", "inductor_provenance": null}, "rank": 0, "frame_id": 0, "frame_compile_id": 0, "attempt": 0} +V0127 17:17:47.308000 1175001 /data/users/angelayi/vllm/vllm/compilation/piecewise_backend.py:142] {"artifact": {"name": "vllm_piecewise_compile_start", "encoding": "json"}, "rank": 0, "stack": [{"line": 1, "name": "", "filename": 1, "loc": ""}, {"line": 122, "name": "spawn_main", "filename": 2, "loc": "exitcode = _main(fd, parent_sentinel)"}, {"line": 135, "name": "_main", "filename": 2, "loc": "return self._bootstrap(parent_sentinel)"}, {"line": 314, "name": "_bootstrap", "filename": 3, "loc": "self.run()"}, {"line": 108, "name": "run", "filename": 3, "loc": "self._target(*self._args, **self._kwargs)"}, {"line": 742, "name": "worker_main", "filename": 4, "loc": "worker.worker_busy_loop(cancel=shutdown_event)"}, {"line": 819, "name": "worker_busy_loop", "filename": 4, "loc": "output = func(*args, **kwargs)"}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 340, "name": "determine_available_memory", "filename": 6, "loc": "self.model_runner.profile_run()"}, {"line": 4516, "name": "profile_run", "filename": 7, "loc": "hidden_states, last_hidden_states = self._dummy_run("}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 4217, "name": "_dummy_run", "filename": 7, "loc": "outputs = self.model("}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 623, "name": "forward", "filename": 10, "loc": "model_output = self.model("}, {"line": 526, "name": "__call__", "filename": 11, "loc": "output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)"}, {"line": 218, "name": "__call__", "filename": 12, "loc": "return self._call_with_optional_nvtx_range("}, {"line": 109, "name": "_call_with_optional_nvtx_range", "filename": 12, "loc": "return callable_fn(*args, **kwargs)"}, {"line": 953, "name": "compile_wrapper", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 412, "name": "forward", "filename": 10, "loc": "def forward("}, {"line": 1181, "name": "_fn", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 54, "name": "__call__", "filename": 44, "loc": "return 
self.optimized_call(*args, **kwargs)"}, {"line": 936, "name": "call_wrapped", "filename": 45, "loc": "return self._wrapped_call(self, *args, **kwargs)"}, {"line": 442, "name": "__call__", "filename": 45, "loc": "return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 202, "name": "forward", "filename": 46, "loc": "submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None"}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 222, "name": "__call__", "filename": 47, "loc": "self._maybe_compile_for_range_entry(range_entry, args)"}, {"line": 178, "name": "_maybe_compile_for_range_entry", "filename": 47, "loc": "self._log_compile_start(range_entry.compile_range)"}, {"line": 142, "name": "_log_compile_start", "filename": 47, "loc": "trace_structured("}], "has_payload": "a9945d34d1b334679b330d57acffcd5a"} + {"piecewise_index": 0, "submod_name": "submod_0", "total_piecewise_compiles": 33, "compile_range_start": 1, "compile_range_end": 16384, "is_single_size": false, "is_cudagraph_capture_size": false} +V0127 17:17:47.313000 1175001 /data/users/angelayi/vllm/vllm/compilation/piecewise_backend.py:165] {"graph_dump": {"name": "vllm_submod_0"}, "rank": 0, "stack": [{"line": 1, "name": "", "filename": 1, "loc": ""}, {"line": 122, "name": "spawn_main", "filename": 2, "loc": "exitcode = _main(fd, parent_sentinel)"}, {"line": 135, "name": "_main", "filename": 2, "loc": "return self._bootstrap(parent_sentinel)"}, {"line": 314, "name": "_bootstrap", "filename": 3, "loc": "self.run()"}, {"line": 108, "name": "run", "filename": 3, "loc": "self._target(*self._args, **self._kwargs)"}, {"line": 742, "name": "worker_main", "filename": 4, "loc": "worker.worker_busy_loop(cancel=shutdown_event)"}, {"line": 819, "name": "worker_busy_loop", "filename": 4, "loc": "output = func(*args, **kwargs)"}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 340, "name": "determine_available_memory", "filename": 6, "loc": "self.model_runner.profile_run()"}, {"line": 4516, "name": "profile_run", "filename": 7, "loc": "hidden_states, last_hidden_states = self._dummy_run("}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 4217, "name": "_dummy_run", "filename": 7, "loc": "outputs = self.model("}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 623, "name": "forward", "filename": 10, "loc": "model_output = self.model("}, {"line": 526, 
"name": "__call__", "filename": 11, "loc": "output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)"}, {"line": 218, "name": "__call__", "filename": 12, "loc": "return self._call_with_optional_nvtx_range("}, {"line": 109, "name": "_call_with_optional_nvtx_range", "filename": 12, "loc": "return callable_fn(*args, **kwargs)"}, {"line": 953, "name": "compile_wrapper", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 412, "name": "forward", "filename": 10, "loc": "def forward("}, {"line": 1181, "name": "_fn", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 54, "name": "__call__", "filename": 44, "loc": "return self.optimized_call(*args, **kwargs)"}, {"line": 936, "name": "call_wrapped", "filename": 45, "loc": "return self._wrapped_call(self, *args, **kwargs)"}, {"line": 442, "name": "__call__", "filename": 45, "loc": "return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 202, "name": "forward", "filename": 46, "loc": "submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None"}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 222, "name": "__call__", "filename": 47, "loc": "self._maybe_compile_for_range_entry(range_entry, args)"}, {"line": 178, "name": "_maybe_compile_for_range_entry", "filename": 47, "loc": "self._log_compile_start(range_entry.compile_range)"}, {"line": 165, "name": "_log_compile_start", "filename": 47, "loc": "trace_structured("}], "has_payload": "c8eb8c4aaf9956c23ecb427e6cd26d38"} + class GraphModule(torch.nn.Module): + def forward(self, l_input_ids_: "i32[s72]", s72: "Sym(s72)", l_self_modules_embed_tokens_parameters_weight_: "bf16[64128, 4096]", l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:167 in get_masked_input_and_mask, code: org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + ge: "b8[s72]" = l_input_ids_ >= 0 + lt: "b8[s72]" = l_input_ids_ < 64128 + and_: "b8[s72]" = ge & lt; ge = lt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:168 in get_masked_input_and_mask, code: added_vocab_mask = (input_ >= added_vocab_start_index) & ( + ge_1: "b8[s72]" = l_input_ids_ >= 128256 + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:169 in get_masked_input_and_mask, code: input_ < added_vocab_end_index + 
lt_1: "b8[s72]" = l_input_ids_ < 128256 + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:168 in get_masked_input_and_mask, code: added_vocab_mask = (input_ >= added_vocab_start_index) & ( + and__1: "b8[s72]" = ge_1 & lt_1; ge_1 = lt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:176 in get_masked_input_and_mask, code: valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + mul: "i64[s72]" = 0 * and_ + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:177 in get_masked_input_and_mask, code: added_offset * added_vocab_mask + mul_1: "i64[s72]" = 64128 * and__1 + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:176 in get_masked_input_and_mask, code: valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + add: "i64[s72]" = mul + mul_1; mul = mul_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:179 in get_masked_input_and_mask, code: vocab_mask = org_vocab_mask | added_vocab_mask + or_: "b8[s72]" = and_ | and__1; and_ = and__1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:180 in get_masked_input_and_mask, code: input_ = vocab_mask * (input_ - valid_offset) + sub: "i64[s72]" = l_input_ids_ - add; l_input_ids_ = add = None + mul_2: "i64[s72]" = or_ * sub; sub = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:181 in get_masked_input_and_mask, code: return input_, ~vocab_mask + invert: "b8[s72]" = ~or_; or_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:475 in forward_native, code: output_parallel = self.quant_method.embedding(self, masked_input.long()) + long: "i64[s72]" = mul_2.long(); mul_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:72 in embedding, code: return F.embedding(input_, layer.weight) + embedding: "bf16[s72, 4096]" = torch.nn.functional.embedding(long, l_self_modules_embed_tokens_parameters_weight_); long = l_self_modules_embed_tokens_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:478 in forward_native, code: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) + unsqueeze: "b8[s72, 1]" = invert.unsqueeze(-1); invert = None + masked_fill_: "bf16[s72, 4096]" = embedding.masked_fill_(unsqueeze, 0); unsqueeze = masked_fill_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(embedding, group_name = 'tp:0'); embedding = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, 
code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096]" = to.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul_3: "f32[s72, 4096]" = to * rsqrt; to = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_1 * _get_data_attr; to_1 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear.split([2048, 512, 512], dim = -1); linear = None + getitem: "bf16[s72, 2048]" = split[0] + getitem_1: "bf16[s72, 512]" = split[1] + getitem_2: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_3: "bf16[s72, 64]" = chunk[0] + getitem_4: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view: "bf16[s72, 16, 128]" = getitem.view(s72, -1, 128); getitem = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_5: "bf16[s72, 16, 128]" = view[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_6: "bf16[s72, 16, 0]" = view[(Ellipsis, slice(128, None, None))]; view = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_3.unsqueeze(-2) + to_2: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_4.unsqueeze(-2) + to_3: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_5, 2, dim = -1); getitem_5 = None + getitem_7: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_8: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_7 * to_2 + mul_6: "bf16[s72, 16, 64]" = getitem_8 * to_3 + sub_1: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_8 * to_2; getitem_8 = to_2 = None + mul_8: "bf16[s72, 16, 64]" = getitem_7 * to_3; getitem_7 = to_3 = None + add_2: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub_1, add_2), dim = -1); sub_1 = add_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_6), dim = -1); cat = getitem_6 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_1.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 4, 128]" = getitem_1.view(s72, -1, 128); getitem_1 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_9: "bf16[s72, 4, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_10: "bf16[s72, 4, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64]" = getitem_3.unsqueeze(-2); getitem_3 = None + to_4: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + 
unsqueeze_4: "bf16[s72, 1, 64]" = getitem_4.unsqueeze(-2); getitem_4 = None + to_5: "bf16[s72, 1, 64]" = unsqueeze_4.to(torch.bfloat16); unsqueeze_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_9, 2, dim = -1); getitem_9 = None + getitem_11: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_12: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_11 * to_4 + mul_10: "bf16[s72, 4, 64]" = getitem_12 * to_5 + sub_2: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_12 * to_4; getitem_12 = to_4 = None + mul_12: "bf16[s72, 4, 64]" = getitem_11 * to_5; getitem_11 = to_5 = None + add_3: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_2, add_3), dim = -1); sub_2 = add_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_10), dim = -1); cat_2 = getitem_10 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_2: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_4: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = getitem_2.view(-1, 4, 128); getitem_2 = None + return (view_2, view_4, view_5, view_3, all_reduce) + +V0127 17:17:47.944000 1175001 site-packages/torch/_inductor/compile_fx.py:1335] {"artifact": {"name": "inductor_post_grad_graph", "encoding": "string"}, "rank": 0, "stack": [{"line": 1, "name": "", "filename": 1, "loc": ""}, {"line": 122, "name": "spawn_main", "filename": 2, "loc": "exitcode = _main(fd, parent_sentinel)"}, {"line": 135, "name": "_main", "filename": 2, "loc": "return 
self._bootstrap(parent_sentinel)"}, {"line": 314, "name": "_bootstrap", "filename": 3, "loc": "self.run()"}, {"line": 108, "name": "run", "filename": 3, "loc": "self._target(*self._args, **self._kwargs)"}, {"line": 742, "name": "worker_main", "filename": 4, "loc": "worker.worker_busy_loop(cancel=shutdown_event)"}, {"line": 819, "name": "worker_busy_loop", "filename": 4, "loc": "output = func(*args, **kwargs)"}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 340, "name": "determine_available_memory", "filename": 6, "loc": "self.model_runner.profile_run()"}, {"line": 4516, "name": "profile_run", "filename": 7, "loc": "hidden_states, last_hidden_states = self._dummy_run("}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 4217, "name": "_dummy_run", "filename": 7, "loc": "outputs = self.model("}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 623, "name": "forward", "filename": 10, "loc": "model_output = self.model("}, {"line": 526, "name": "__call__", "filename": 11, "loc": "output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)"}, {"line": 218, "name": "__call__", "filename": 12, "loc": "return self._call_with_optional_nvtx_range("}, {"line": 109, "name": "_call_with_optional_nvtx_range", "filename": 12, "loc": "return callable_fn(*args, **kwargs)"}, {"line": 953, "name": "compile_wrapper", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 412, "name": "forward", "filename": 10, "loc": "def forward("}, {"line": 1181, "name": "_fn", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 54, "name": "__call__", "filename": 44, "loc": "return self.optimized_call(*args, **kwargs)"}, {"line": 936, "name": "call_wrapped", "filename": 45, "loc": "return self._wrapped_call(self, *args, **kwargs)"}, {"line": 442, "name": "__call__", "filename": 45, "loc": "return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 202, "name": "forward", "filename": 46, "loc": "submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None"}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 222, "name": "__call__", "filename": 47, "loc": "self._maybe_compile_for_range_entry(range_entry, args)"}, {"line": 189, "name": "_maybe_compile_for_range_entry", "filename": 47, "loc": "range_entry.runnable = self.vllm_backend.compiler_manager.compile("}, {"line": 245, "name": "compile", 
"filename": 43, "loc": "compiled_graph, handle = self.compiler.compile("}, {"line": 233, "name": "compile", "filename": 48, "loc": "compiled_graph = standalone_compile("}, {"line": 445, "name": "standalone_compile", "filename": 49, "loc": "return standalone_compile("}, {"line": 423, "name": "standalone_compile", "filename": 50, "loc": "compiled_fn = compile_fx("}, {"line": 2486, "name": "compile_fx", "filename": 51, "loc": "return compile_fx("}, {"line": 2537, "name": "compile_fx", "filename": 51, "loc": "return _maybe_wrap_and_compile_fx_main("}, {"line": 2614, "name": "_maybe_wrap_and_compile_fx_main", "filename": 51, "loc": "return _compile_fx_main("}, {"line": 2809, "name": "_compile_fx_main", "filename": 51, "loc": "return aot_autograd("}, {"line": 123, "name": "__call__", "filename": 52, "loc": "cg = aot_module_simplified(gm, example_inputs, **self.kwargs)"}, {"line": 1115, "name": "aot_module_simplified", "filename": 53, "loc": "compiled_fn, _ = aot_stage2_compile("}, {"line": 357, "name": "aot_stage2_compile", "filename": 55, "loc": "return aot_stage2_inference(aot_state, aot_graph_capture)"}, {"line": 431, "name": "aot_stage2_inference", "filename": 55, "loc": "compiled_fw = _aot_stage2b_inference_compile("}, {"line": 400, "name": "_aot_stage2b_inference_compile", "filename": 55, "loc": "return _aot_stage2b_compile_forward_or_inference("}, {"line": 2300, "name": "_aot_stage2b_compile_forward_or_inference", "filename": 55, "loc": "compiled_fw_func = compiler(fw_module, adjusted_flat_args)"}, {"line": 1249, "name": "__call__", "filename": 57, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2678, "name": "fw_compiler_base", "filename": 51, "loc": "return compile_fx_forward("}, {"line": 2350, "name": "compile_fx_forward", "filename": 51, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 58, "loc": "return func(*args, **kwds)"}, {"line": 806, "name": "compile_fx_inner", "filename": 51, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 146, "name": "debug_wrapper", "filename": 59, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 1003, "name": "_compile_fx_inner", "filename": 51, "loc": "mb_compiled_graph = fx_codegen_and_compile("}, {"line": 1766, "name": "fx_codegen_and_compile", "filename": 51, "loc": "return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)"}, {"line": 1335, "name": "codegen_and_compile", "filename": 51, "loc": "trace_structured("}], "has_payload": "53d2772977acf8063190f70e6d9648f8"} + class (torch.nn.Module): + def forward(self, arg0_1: "i32[s72][1]cuda:0", arg1_1: "Sym(s72)", arg2_1: "bf16[64128, 4096][4096, 1]cuda:0", arg3_1: "bf16[4096][1]cuda:0", arg4_1: "bf16[3072, 4096][4096, 1]cuda:0", arg5_1: "i64[s72][1]cuda:0", arg6_1: "bf16[131072, 128][128, 1]cuda:0"): + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:167 in get_masked_input_and_mask, code: org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + ge: "b8[s72][1]cuda:0" = torch.ops.aten.ge.Scalar(arg0_1, 0) + lt: "b8[s72][1]cuda:0" = torch.ops.aten.lt.Scalar(arg0_1, 64128) + bitwise_and: "b8[s72][1]cuda:0" = torch.ops.aten.bitwise_and.Tensor(ge, lt); ge = lt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:168 in get_masked_input_and_mask, code: added_vocab_mask = (input_ >= added_vocab_start_index) & ( + ge_1: "b8[s72][1]cuda:0" = 
torch.ops.aten.ge.Scalar(arg0_1, 128256) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:169 in get_masked_input_and_mask, code: input_ < added_vocab_end_index + lt_1: "b8[s72][1]cuda:0" = torch.ops.aten.lt.Scalar(arg0_1, 128256) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:168 in get_masked_input_and_mask, code: added_vocab_mask = (input_ >= added_vocab_start_index) & ( + bitwise_and_1: "b8[s72][1]cuda:0" = torch.ops.aten.bitwise_and.Tensor(ge_1, lt_1); ge_1 = lt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:179 in get_masked_input_and_mask, code: vocab_mask = org_vocab_mask | added_vocab_mask + bitwise_or: "b8[s72][1]cuda:0" = torch.ops.aten.bitwise_or.Tensor(bitwise_and, bitwise_and_1); bitwise_and = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:181 in get_masked_input_and_mask, code: return input_, ~vocab_mask + bitwise_not: "b8[s72][1]cuda:0" = torch.ops.aten.bitwise_not.default(bitwise_or) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:478 in forward_native, code: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) + unsqueeze: "b8[s72, 1][1, 1]cuda:0" = torch.ops.aten.unsqueeze.default(bitwise_not, -1); bitwise_not = None + full_default_1: "bf16[][]cuda:0" = torch.ops.aten.full.default([], 0.0, dtype = torch.bfloat16, layout = torch.strided, device = device(type='cuda', index=0), pin_memory = False) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:176 in get_masked_input_and_mask, code: valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + mul_2: "i64[s72][1]cuda:0" = torch.ops.aten.mul.Tensor(bitwise_and_1, 64128); bitwise_and_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:180 in get_masked_input_and_mask, code: input_ = vocab_mask * (input_ - valid_offset) + sub_10: "i64[s72][1]cuda:0" = torch.ops.aten.sub.Tensor(arg0_1, mul_2); arg0_1 = mul_2 = None + mul_6: "i64[s72][1]cuda:0" = torch.ops.aten.mul.Tensor(bitwise_or, sub_10); bitwise_or = sub_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:72 in embedding, code: return F.embedding(input_, layer.weight) + embedding: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.embedding.default(arg2_1, mul_6); arg2_1 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py:478 in forward_native, code: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) + where: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.where.self(unsqueeze, full_default_1, embedding); unsqueeze = full_default_1 = embedding = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce.default(where, 'tp:0'); where = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + convert_element_type: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.prims.convert_element_type.default(all_reduce, torch.float32) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, 
keepdim=True) + pow_1: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.pow.Tensor_Scalar(convert_element_type, 2) + mean: "f32[s72, 1][1, 1]cuda:0" = torch.ops.aten.mean.dim(pow_1, [-1], True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_54: "f32[s72, 1][1, 1]cuda:0" = torch.ops.aten.add.Tensor(mean, 1e-05); mean = None + rsqrt: "f32[s72, 1][1, 1]cuda:0" = torch.ops.aten.rsqrt.default(add_54); add_54 = None + mul_25: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.mul.Tensor(convert_element_type, rsqrt); convert_element_type = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + convert_element_type_1: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.prims.convert_element_type.default(mul_25, torch.bfloat16); mul_25 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_30: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.mul.Tensor(convert_element_type_1, arg3_1); convert_element_type_1 = arg3_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + permute: "bf16[4096, 3072][1, 4096]cuda:0" = torch.ops.aten.permute.default(arg4_1, [1, 0]); arg4_1 = None + mm: "bf16[s72, 3072][3072, 1]cuda:0" = torch.ops.aten.mm.default(mul_30, permute); mul_30 = permute = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_with_sizes = torch.ops.aten.split_with_sizes.default(mm, [2048, 512, 512], -1); mm = None + getitem: "bf16[s72, 2048][3072, 1]cuda:0" = split_with_sizes[0] + getitem_1: "bf16[s72, 512][3072, 1]cuda:0" = split_with_sizes[1] + getitem_2: "bf16[s72, 512][3072, 1]cuda:0" = split_with_sizes[2]; split_with_sizes = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index: "bf16[s72, 128][128, 1]cuda:0" = torch.ops.aten.index.Tensor(arg6_1, [arg5_1]); arg6_1 = arg5_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + split = torch.ops.aten.split.Tensor(index, 64, -1); index = None + getitem_3: "bf16[s72, 64][128, 1]cuda:0" = split[0] + getitem_4: "bf16[s72, 64][128, 1]cuda:0" = split[1]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = torch.ops.aten.reshape.default(getitem, [arg1_1, -1, 128]); getitem = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + split_1 = torch.ops.aten.split.Tensor(view, 64, -1); view = None + getitem_5: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = split_1[0] + getitem_6: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = split_1[1]; split_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: 
"bf16[s72, 4, 128][3072, 128, 1]cuda:0" = torch.ops.aten.reshape.default(getitem_1, [arg1_1, -1, 128]); getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + split_2 = torch.ops.aten.split.Tensor(view_2, 64, -1); view_2 = None + getitem_7: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = split_2[0] + getitem_8: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = split_2[1]; split_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = torch.ops.aten.unsqueeze.default(getitem_3, -2) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_60: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_5, unsqueeze_1) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = torch.ops.aten.unsqueeze.default(getitem_4, -2) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_63: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_6, unsqueeze_2) + sub_44: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.sub.Tensor(mul_60, mul_63); mul_60 = mul_63 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_68: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_6, unsqueeze_1); getitem_6 = unsqueeze_1 = None + mul_71: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_5, unsqueeze_2); getitem_5 = unsqueeze_2 = None + add_137: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.add.Tensor(mul_68, mul_71); mul_68 = mul_71 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.ops.aten.cat.default([sub_44, add_137], -1); sub_44 = add_137 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = torch.ops.aten.unsqueeze.default(getitem_3, -2); getitem_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_99: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_7, unsqueeze_3) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_4: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = torch.ops.aten.unsqueeze.default(getitem_4, -2); getitem_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_102: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_8, unsqueeze_4) + sub_61: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = 
torch.ops.aten.sub.Tensor(mul_99, mul_102); mul_99 = mul_102 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_107: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_8, unsqueeze_3); getitem_8 = unsqueeze_3 = None + mul_110: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_7, unsqueeze_4); getitem_7 = unsqueeze_4 = None + add_199: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.add.Tensor(mul_107, mul_110); mul_107 = mul_110 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_1: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.ops.aten.cat.default([sub_61, add_199], -1); sub_61 = add_199 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_7: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = torch.ops.aten.reshape.default(getitem_2, [-1, 4, 128]); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048][2048, 1]cuda:0" = torch.ops.aten.empty.memory_format([arg1_1, 2048], dtype = torch.bfloat16, device = device(type='cuda', index=0), pin_memory = False); arg1_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_5: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.ops.aten.reshape.default(empty, [-1, 16, 128]); empty = None + return (cat, cat_1, view_7, view_5, all_reduce) + +V0127 17:17:48.746000 1175001 /data/users/angelayi/vllm/vllm/compilation/piecewise_backend.py:142] {"artifact": {"name": "vllm_piecewise_compile_start", "encoding": "json"}, "rank": 0, "stack": [{"line": 1, "name": "", "filename": 1, "loc": ""}, {"line": 122, "name": "spawn_main", "filename": 2, "loc": "exitcode = _main(fd, parent_sentinel)"}, {"line": 135, "name": "_main", "filename": 2, "loc": "return self._bootstrap(parent_sentinel)"}, {"line": 314, "name": "_bootstrap", "filename": 3, "loc": "self.run()"}, {"line": 108, "name": "run", "filename": 3, "loc": "self._target(*self._args, **self._kwargs)"}, {"line": 742, "name": "worker_main", "filename": 4, "loc": "worker.worker_busy_loop(cancel=shutdown_event)"}, {"line": 819, "name": "worker_busy_loop", "filename": 4, "loc": "output = func(*args, **kwargs)"}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 340, "name": "determine_available_memory", "filename": 6, "loc": "self.model_runner.profile_run()"}, {"line": 4516, "name": "profile_run", "filename": 7, "loc": "hidden_states, last_hidden_states = self._dummy_run("}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 4217, "name": "_dummy_run", "filename": 7, "loc": "outputs = self.model("}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 623, "name": "forward", "filename": 10, "loc": "model_output = self.model("}, {"line": 526, 
"name": "__call__", "filename": 11, "loc": "output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)"}, {"line": 218, "name": "__call__", "filename": 12, "loc": "return self._call_with_optional_nvtx_range("}, {"line": 109, "name": "_call_with_optional_nvtx_range", "filename": 12, "loc": "return callable_fn(*args, **kwargs)"}, {"line": 953, "name": "compile_wrapper", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 412, "name": "forward", "filename": 10, "loc": "def forward("}, {"line": 1181, "name": "_fn", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 54, "name": "__call__", "filename": 44, "loc": "return self.optimized_call(*args, **kwargs)"}, {"line": 936, "name": "call_wrapped", "filename": 45, "loc": "return self._wrapped_call(self, *args, **kwargs)"}, {"line": 442, "name": "__call__", "filename": 45, "loc": "return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 209, "name": "forward", "filename": 46, "loc": "submod_2 = self.submod_2(getitem_3, s72, l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_, getitem_4, l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_3 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = getitem_4 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = None"}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 222, "name": "__call__", "filename": 47, "loc": "self._maybe_compile_for_range_entry(range_entry, args)"}, {"line": 178, "name": "_maybe_compile_for_range_entry", "filename": 47, "loc": "self._log_compile_start(range_entry.compile_range)"}, {"line": 142, "name": "_log_compile_start", "filename": 47, "loc": "trace_structured("}], "has_payload": "e6452abf5e18e68fd901493cfdaa8f9f"} + {"piecewise_index": 1, "submod_name": "submod_2", "total_piecewise_compiles": 33, "compile_range_start": 1, "compile_range_end": 16384, "is_single_size": false, "is_cudagraph_capture_size": false} +V0127 17:17:48.751000 1175001 /data/users/angelayi/vllm/vllm/compilation/piecewise_backend.py:165] {"graph_dump": {"name": "vllm_submod_2"}, "rank": 0, "stack": [{"line": 1, "name": "", "filename": 1, "loc": ""}, {"line": 122, "name": "spawn_main", "filename": 2, "loc": "exitcode = _main(fd, parent_sentinel)"}, {"line": 135, "name": "_main", "filename": 2, "loc": "return self._bootstrap(parent_sentinel)"}, {"line": 314, "name": "_bootstrap", 
"filename": 3, "loc": "self.run()"}, {"line": 108, "name": "run", "filename": 3, "loc": "self._target(*self._args, **self._kwargs)"}, {"line": 742, "name": "worker_main", "filename": 4, "loc": "worker.worker_busy_loop(cancel=shutdown_event)"}, {"line": 819, "name": "worker_busy_loop", "filename": 4, "loc": "output = func(*args, **kwargs)"}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 340, "name": "determine_available_memory", "filename": 6, "loc": "self.model_runner.profile_run()"}, {"line": 4516, "name": "profile_run", "filename": 7, "loc": "hidden_states, last_hidden_states = self._dummy_run("}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 4217, "name": "_dummy_run", "filename": 7, "loc": "outputs = self.model("}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 623, "name": "forward", "filename": 10, "loc": "model_output = self.model("}, {"line": 526, "name": "__call__", "filename": 11, "loc": "output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)"}, {"line": 218, "name": "__call__", "filename": 12, "loc": "return self._call_with_optional_nvtx_range("}, {"line": 109, "name": "_call_with_optional_nvtx_range", "filename": 12, "loc": "return callable_fn(*args, **kwargs)"}, {"line": 953, "name": "compile_wrapper", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 412, "name": "forward", "filename": 10, "loc": "def forward("}, {"line": 1181, "name": "_fn", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 54, "name": "__call__", "filename": 44, "loc": "return self.optimized_call(*args, **kwargs)"}, {"line": 936, "name": "call_wrapped", "filename": 45, "loc": "return self._wrapped_call(self, *args, **kwargs)"}, {"line": 442, "name": "__call__", "filename": 45, "loc": "return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 209, "name": "forward", "filename": 46, "loc": "submod_2 = self.submod_2(getitem_3, s72, l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_, getitem_4, l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_3 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = getitem_4 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = 
l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = None"}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 222, "name": "__call__", "filename": 47, "loc": "self._maybe_compile_for_range_entry(range_entry, args)"}, {"line": 178, "name": "_maybe_compile_for_range_entry", "filename": 47, "loc": "self._log_compile_start(range_entry.compile_range)"}, {"line": 165, "name": "_log_compile_start", "filename": 47, "loc": "trace_structured("}], "has_payload": "d42d1a58887db1b20c5ad74f8f817d28"} + class GraphModule(torch.nn.Module): + def forward(self, output_4: "bf16[s72, 16, 128]", s72: "Sym(s72)", l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_: "bf16[4096, 2048]", l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_: "bf16[4096]", output: "bf16[s72, 4096]", l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_: "bf16[14336, 4096]", l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_: "bf16[4096, 7168]", l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_: "bf16[4096]", l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_: "bf16[3072, 4096]", l_positions_: "i64[s72]", l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_: "bf16[131072, 128]"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048]" = output_4.view(-1, 2048); output_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear: "bf16[s72, 4096]" = torch._C._nn.linear(view, l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, None); view = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear, group_name = 'tp:0'); linear = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_); l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to: "f32[s72, 4096]" = all_reduce.to(torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add: "f32[s72, 4096]" = to + output; to = output = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_1: "bf16[s72, 4096]" = add.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: 
"f32[s72, 4096]" = add.pow(2) + mean: "f32[s72, 1]" = pow_1.mean(dim = -1, keepdim = True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_1: "f32[s72, 1]" = mean + 1e-05; mean = None + rsqrt: "f32[s72, 1]" = torch.rsqrt(add_1); add_1 = None + mul: "f32[s72, 4096]" = add * rsqrt; add = rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_2: "bf16[s72, 4096]" = mul.to(torch.bfloat16); mul = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_1: "bf16[s72, 4096]" = to_2 * _get_data_attr; to_2 = _get_data_attr = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_1: "bf16[s72, 14336]" = torch._C._nn.linear(mul_1, l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, None); mul_1 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + getitem: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(None, 7168, None))] + silu: "bf16[s72, 7168]" = torch.nn.functional.silu(getitem); getitem = None + getitem_1: "bf16[s72, 7168]" = linear_1[(Ellipsis, slice(7168, None, None))]; linear_1 = None + mul_2: "bf16[s72, 7168]" = silu * getitem_1; silu = getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_2: "bf16[s72, 4096]" = torch._C._nn.linear(mul_2, l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, None); mul_2 = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096]" = torch.ops.vllm.all_reduce(linear_2, group_name = 'tp:0'); linear_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:189 in forward_native, code: self.weight.data if self.has_weight else None, + _get_data_attr_1: "bf16[4096]" = torch._C._autograd._get_data_attr(l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_); l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + to_3: "f32[s72, 4096]" = all_reduce_1.to(torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_2: "f32[s72, 4096]" = to_3 + to_1; to_3 = to_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + to_4: "bf16[s72, 4096]" = add_2.to(torch.bfloat16) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096]" = add_2.pow(2) 
+ mean_1: "f32[s72, 1]" = pow_2.mean(dim = -1, keepdim = True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_3: "f32[s72, 1]" = mean_1 + 1e-05; mean_1 = None + rsqrt_1: "f32[s72, 1]" = torch.rsqrt(add_3); add_3 = None + mul_3: "f32[s72, 4096]" = add_2 * rsqrt_1; add_2 = rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + to_5: "bf16[s72, 4096]" = mul_3.to(torch.bfloat16); mul_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_4: "bf16[s72, 4096]" = to_5 * _get_data_attr_1; to_5 = _get_data_attr_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + linear_3: "bf16[s72, 3072]" = torch._C._nn.linear(mul_4, l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, None); mul_4 = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split = linear_3.split([2048, 512, 512], dim = -1); linear_3 = None + getitem_2: "bf16[s72, 2048]" = split[0] + getitem_3: "bf16[s72, 512]" = split[1] + getitem_4: "bf16[s72, 512]" = split[2]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:121 in forward_static, code: positions = positions.flatten() + flatten: "i64[s72]" = l_positions_.flatten(); l_positions_ = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index_select: "bf16[s72, 128]" = l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_.index_select(0, flatten); l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_ = flatten = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + chunk = index_select.chunk(2, dim = -1); index_select = None + getitem_5: "bf16[s72, 64]" = chunk[0] + getitem_6: "bf16[s72, 64]" = chunk[1]; chunk = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:126 in forward_static, code: query_shape = query.shape + size = getitem_2.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128]" = getitem_2.view(s72, -1, 128); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:128 in forward_static, code: query_rot = query[..., :rotary_dim] + getitem_7: "bf16[s72, 16, 128]" = view_1[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:129 in forward_static, code: query_pass = query[..., rotary_dim:] + getitem_8: "bf16[s72, 16, 0]" = view_1[(Ellipsis, slice(128, None, None))]; view_1 = None + + # File: 
/data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2) + to_6: "bf16[s72, 1, 64]" = unsqueeze.to(torch.bfloat16); unsqueeze = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2) + to_7: "bf16[s72, 1, 64]" = unsqueeze_1.to(torch.bfloat16); unsqueeze_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_1 = torch.chunk(getitem_7, 2, dim = -1); getitem_7 = None + getitem_9: "bf16[s72, 16, 64]" = chunk_1[0] + getitem_10: "bf16[s72, 16, 64]" = chunk_1[1]; chunk_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_5: "bf16[s72, 16, 64]" = getitem_9 * to_6 + mul_6: "bf16[s72, 16, 64]" = getitem_10 * to_7 + sub: "bf16[s72, 16, 64]" = mul_5 - mul_6; mul_5 = mul_6 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_7: "bf16[s72, 16, 64]" = getitem_10 * to_6; getitem_10 = to_6 = None + mul_8: "bf16[s72, 16, 64]" = getitem_9 * to_7; getitem_9 = to_7 = None + add_4: "bf16[s72, 16, 64]" = mul_7 + mul_8; mul_7 = mul_8 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128]" = torch.cat((sub, add_4), dim = -1); sub = add_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:136 in forward_static, code: query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + cat_1: "bf16[s72, 16, 128]" = torch.cat((cat, getitem_8), dim = -1); cat = getitem_8 = None + reshape: "bf16[s72, 2048]" = cat_1.reshape(size); cat_1 = size = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:140 in forward_static, code: key_shape = key.shape + size_1 = getitem_3.size() + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_2: "bf16[s72, 4, 128]" = getitem_3.view(s72, -1, 128); getitem_3 = s72 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:142 in forward_static, code: key_rot = key[..., :rotary_dim] + getitem_11: "bf16[s72, 4, 128]" = view_2[(Ellipsis, slice(None, 128, None))] + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:143 in forward_static, code: key_pass = key[..., rotary_dim:] + getitem_12: "bf16[s72, 4, 0]" = view_2[(Ellipsis, slice(128, None, None))]; view_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64]" = getitem_5.unsqueeze(-2); getitem_5 = None + to_8: "bf16[s72, 1, 64]" = unsqueeze_2.to(torch.bfloat16); unsqueeze_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: 
"bf16[s72, 1, 64]" = getitem_6.unsqueeze(-2); getitem_6 = None + to_9: "bf16[s72, 1, 64]" = unsqueeze_3.to(torch.bfloat16); unsqueeze_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + chunk_2 = torch.chunk(getitem_11, 2, dim = -1); getitem_11 = None + getitem_13: "bf16[s72, 4, 64]" = chunk_2[0] + getitem_14: "bf16[s72, 4, 64]" = chunk_2[1]; chunk_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_9: "bf16[s72, 4, 64]" = getitem_13 * to_8 + mul_10: "bf16[s72, 4, 64]" = getitem_14 * to_9 + sub_1: "bf16[s72, 4, 64]" = mul_9 - mul_10; mul_9 = mul_10 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_11: "bf16[s72, 4, 64]" = getitem_14 * to_8; getitem_14 = to_8 = None + mul_12: "bf16[s72, 4, 64]" = getitem_13 * to_9; getitem_13 = to_9 = None + add_5: "bf16[s72, 4, 64]" = mul_11 + mul_12; mul_11 = mul_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_2: "bf16[s72, 4, 128]" = torch.cat((sub_1, add_5), dim = -1); sub_1 = add_5 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:150 in forward_static, code: key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + cat_3: "bf16[s72, 4, 128]" = torch.cat((cat_2, getitem_12), dim = -1); cat_2 = getitem_12 = None + reshape_1: "bf16[s72, 512]" = cat_3.reshape(size_1); cat_3 = size_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:325 in forward, code: output_shape = output_shape if output_shape is not None else query.shape + size_2 = reshape.size() + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048]" = torch.empty(size_2, dtype = torch.bfloat16, device = device(type='cuda', index=0)); size_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:331 in forward, code: query = query.view(-1, self.num_heads, self.head_size) + view_3: "bf16[s72, 16, 128]" = reshape.view(-1, 16, 128); reshape = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_4: "bf16[s72, 16, 128]" = empty.view(-1, 16, 128); empty = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:334 in forward, code: key = key.view(-1, self.num_kv_heads, self.head_size) + view_5: "bf16[s72, 4, 128]" = reshape_1.view(-1, 4, 128); reshape_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_6: "bf16[s72, 4, 128]" = getitem_4.view(-1, 4, 128); getitem_4 = None + return (view_3, view_5, view_6, view_4, to_4) + +V0127 17:17:49.085000 1175001 site-packages/torch/_inductor/compile_fx.py:1335] {"artifact": {"name": "inductor_post_grad_graph", "encoding": "string"}, "rank": 0, "stack": [{"line": 1, "name": "", "filename": 1, "loc": ""}, {"line": 122, "name": "spawn_main", "filename": 2, "loc": "exitcode = _main(fd, parent_sentinel)"}, {"line": 135, "name": "_main", "filename": 2, "loc": "return 
self._bootstrap(parent_sentinel)"}, {"line": 314, "name": "_bootstrap", "filename": 3, "loc": "self.run()"}, {"line": 108, "name": "run", "filename": 3, "loc": "self._target(*self._args, **self._kwargs)"}, {"line": 742, "name": "worker_main", "filename": 4, "loc": "worker.worker_busy_loop(cancel=shutdown_event)"}, {"line": 819, "name": "worker_busy_loop", "filename": 4, "loc": "output = func(*args, **kwargs)"}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 340, "name": "determine_available_memory", "filename": 6, "loc": "self.model_runner.profile_run()"}, {"line": 4516, "name": "profile_run", "filename": 7, "loc": "hidden_states, last_hidden_states = self._dummy_run("}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 4217, "name": "_dummy_run", "filename": 7, "loc": "outputs = self.model("}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 623, "name": "forward", "filename": 10, "loc": "model_output = self.model("}, {"line": 526, "name": "__call__", "filename": 11, "loc": "output = TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)"}, {"line": 218, "name": "__call__", "filename": 12, "loc": "return self._call_with_optional_nvtx_range("}, {"line": 109, "name": "_call_with_optional_nvtx_range", "filename": 12, "loc": "return callable_fn(*args, **kwargs)"}, {"line": 953, "name": "compile_wrapper", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 412, "name": "forward", "filename": 10, "loc": "def forward("}, {"line": 1181, "name": "_fn", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 54, "name": "__call__", "filename": 44, "loc": "return self.optimized_call(*args, **kwargs)"}, {"line": 936, "name": "call_wrapped", "filename": 45, "loc": "return self._wrapped_call(self, *args, **kwargs)"}, {"line": 442, "name": "__call__", "filename": 45, "loc": "return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 209, "name": "forward", "filename": 46, "loc": "submod_2 = self.submod_2(getitem_3, s72, l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_, getitem_4, l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_3 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = getitem_4 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = 
l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = None"}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 222, "name": "__call__", "filename": 47, "loc": "self._maybe_compile_for_range_entry(range_entry, args)"}, {"line": 189, "name": "_maybe_compile_for_range_entry", "filename": 47, "loc": "range_entry.runnable = self.vllm_backend.compiler_manager.compile("}, {"line": 245, "name": "compile", "filename": 43, "loc": "compiled_graph, handle = self.compiler.compile("}, {"line": 233, "name": "compile", "filename": 48, "loc": "compiled_graph = standalone_compile("}, {"line": 445, "name": "standalone_compile", "filename": 49, "loc": "return standalone_compile("}, {"line": 423, "name": "standalone_compile", "filename": 50, "loc": "compiled_fn = compile_fx("}, {"line": 2486, "name": "compile_fx", "filename": 51, "loc": "return compile_fx("}, {"line": 2537, "name": "compile_fx", "filename": 51, "loc": "return _maybe_wrap_and_compile_fx_main("}, {"line": 2614, "name": "_maybe_wrap_and_compile_fx_main", "filename": 51, "loc": "return _compile_fx_main("}, {"line": 2809, "name": "_compile_fx_main", "filename": 51, "loc": "return aot_autograd("}, {"line": 123, "name": "__call__", "filename": 52, "loc": "cg = aot_module_simplified(gm, example_inputs, **self.kwargs)"}, {"line": 1115, "name": "aot_module_simplified", "filename": 53, "loc": "compiled_fn, _ = aot_stage2_compile("}, {"line": 357, "name": "aot_stage2_compile", "filename": 55, "loc": "return aot_stage2_inference(aot_state, aot_graph_capture)"}, {"line": 431, "name": "aot_stage2_inference", "filename": 55, "loc": "compiled_fw = _aot_stage2b_inference_compile("}, {"line": 400, "name": "_aot_stage2b_inference_compile", "filename": 55, "loc": "return _aot_stage2b_compile_forward_or_inference("}, {"line": 2300, "name": "_aot_stage2b_compile_forward_or_inference", "filename": 55, "loc": "compiled_fw_func = compiler(fw_module, adjusted_flat_args)"}, {"line": 1249, "name": "__call__", "filename": 57, "loc": "return self.compiler_fn(gm, example_inputs)"}, {"line": 2678, "name": "fw_compiler_base", "filename": 51, "loc": "return compile_fx_forward("}, {"line": 2350, "name": "compile_fx_forward", "filename": 51, "loc": "return inner_compile("}, {"line": 81, "name": "inner", "filename": 58, "loc": "return func(*args, **kwds)"}, {"line": 806, "name": "compile_fx_inner", "filename": 51, "loc": "return wrap_compiler_debug(_compile_fx_inner, compiler_name=\"inductor\")("}, {"line": 146, "name": "debug_wrapper", "filename": 59, "loc": "inner_compiled_fn = compiler_fn(gm, example_inputs)"}, {"line": 1003, "name": "_compile_fx_inner", "filename": 51, "loc": "mb_compiled_graph = fx_codegen_and_compile("}, {"line": 1766, "name": "fx_codegen_and_compile", "filename": 51, "loc": "return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)"}, {"line": 1335, "name": "codegen_and_compile", "filename": 51, "loc": "trace_structured("}], "has_payload": "b9c8f0218ebdc41027e665ef6962619b"} + class (torch.nn.Module): + def forward(self, arg0_1: "bf16[s72, 16, 128][2048, 128, 1]cuda:0", arg1_1: "Sym(s72)", arg2_1: "bf16[4096, 2048][2048, 1]cuda:0", arg3_1: "bf16[4096][1]cuda:0", arg4_1: "bf16[s72, 4096][4096, 1]cuda:0", arg5_1: "bf16[14336, 4096][4096, 1]cuda:0", arg6_1: "bf16[4096, 7168][7168, 
1]cuda:0", arg7_1: "bf16[4096][1]cuda:0", arg8_1: "bf16[3072, 4096][4096, 1]cuda:0", arg9_1: "i64[s72][1]cuda:0", arg10_1: "bf16[131072, 128][128, 1]cuda:0"): + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:350 in forward, code: return output.view(-1, hidden_size) + view: "bf16[s72, 2048][2048, 1]cuda:0" = torch.ops.aten.reshape.default(arg0_1, [-1, 2048]); arg0_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + permute: "bf16[2048, 4096][1, 2048]cuda:0" = torch.ops.aten.permute.default(arg2_1, [1, 0]); arg2_1 = None + mm: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.mm.default(view, permute); view = permute = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce.default(mm, 'tp:0'); mm = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + convert_element_type_2: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.prims.convert_element_type.default(all_reduce, torch.float32); all_reduce = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_12: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.add.Tensor(convert_element_type_2, arg4_1); convert_element_type_2 = arg4_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_1: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.pow.Tensor_Scalar(add_12, 2) + mean: "f32[s72, 1][1, 1]cuda:0" = torch.ops.aten.mean.dim(pow_1, [-1], True); pow_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_25: "f32[s72, 1][1, 1]cuda:0" = torch.ops.aten.add.Tensor(mean, 1e-05); mean = None + rsqrt: "f32[s72, 1][1, 1]cuda:0" = torch.ops.aten.rsqrt.default(add_25); add_25 = None + mul_17: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.mul.Tensor(add_12, rsqrt); rsqrt = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + convert_element_type_4: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.prims.convert_element_type.default(mul_17, torch.bfloat16); mul_17 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_22: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.mul.Tensor(convert_element_type_4, arg3_1); convert_element_type_4 = arg3_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + permute_1: "bf16[4096, 14336][1, 4096]cuda:0" = torch.ops.aten.permute.default(arg5_1, [1, 0]); arg5_1 = None + mm_1: "bf16[s72, 14336][14336, 1]cuda:0" = torch.ops.aten.mm.default(mul_22, permute_1); mul_22 = permute_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/activation.py:87 in forward_native, code: return F.silu(x[..., :d]) * x[..., d:] + slice_1: "bf16[s72, 7168][14336, 1]cuda:0" = torch.ops.aten.slice.Tensor(mm_1, 1, 0, 7168) + convert_element_type_7: 
"f32[s72, 7168][7168, 1]cuda:0" = torch.ops.prims.convert_element_type.default(slice_1, torch.float32); slice_1 = None + sigmoid: "f32[s72, 7168][7168, 1]cuda:0" = torch.ops.aten.sigmoid.default(convert_element_type_7) + mul_29: "f32[s72, 7168][7168, 1]cuda:0" = torch.ops.aten.mul.Tensor(convert_element_type_7, sigmoid); convert_element_type_7 = sigmoid = None + convert_element_type_8: "bf16[s72, 7168][7168, 1]cuda:0" = torch.ops.prims.convert_element_type.default(mul_29, torch.bfloat16); mul_29 = None + slice_2: "bf16[s72, 7168][14336, 1]cuda:0" = torch.ops.aten.slice.Tensor(mm_1, 1, 7168, 9223372036854775807); mm_1 = None + mul_34: "bf16[s72, 7168][7168, 1]cuda:0" = torch.ops.aten.mul.Tensor(convert_element_type_8, slice_2); convert_element_type_8 = slice_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + permute_2: "bf16[7168, 4096][1, 7168]cuda:0" = torch.ops.aten.permute.default(arg6_1, [1, 0]); arg6_1 = None + mm_2: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.mm.default(mul_34, permute_2); mul_34 = permute_2 = None + + # File: /data/users/angelayi/vllm/vllm/distributed/parallel_state.py:500 in all_reduce, code: return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) + all_reduce_1: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.vllm.all_reduce.default(mm_2, 'tp:0'); mm_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:142 in forward_static, code: x = x.to(torch.float32) + convert_element_type_11: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.prims.convert_element_type.default(all_reduce_1, torch.float32); all_reduce_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + convert_element_type_3: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.prims.convert_element_type.default(add_12, torch.bfloat16); add_12 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:147 in forward_static, code: x = x + residual + add_65: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.add.Tensor(convert_element_type_11, convert_element_type_3); convert_element_type_11 = convert_element_type_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:166 in forward_static, code: variance = x_var.pow(2).mean(dim=-1, keepdim=True) + pow_2: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.pow.Tensor_Scalar(add_65, 2) + mean_1: "f32[s72, 1][1, 1]cuda:0" = torch.ops.aten.mean.dim(pow_2, [-1], True); pow_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:168 in forward_static, code: x = x * torch.rsqrt(variance + variance_epsilon) + add_78: "f32[s72, 1][1, 1]cuda:0" = torch.ops.aten.add.Tensor(mean_1, 1e-05); mean_1 = None + rsqrt_1: "f32[s72, 1][1, 1]cuda:0" = torch.ops.aten.rsqrt.default(add_78); add_78 = None + mul_52: "f32[s72, 4096][4096, 1]cuda:0" = torch.ops.aten.mul.Tensor(add_65, rsqrt_1); rsqrt_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:169 in forward_static, code: x = x.to(orig_dtype) + convert_element_type_13: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.prims.convert_element_type.default(mul_52, torch.bfloat16); mul_52 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:171 in forward_static, code: x = x * weight + mul_57: "bf16[s72, 4096][4096, 1]cuda:0" = 
torch.ops.aten.mul.Tensor(convert_element_type_13, arg7_1); convert_element_type_13 = arg7_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/utils.py:105 in default_unquantized_gemm, code: return torch.nn.functional.linear(x, weight, bias) + permute_3: "bf16[4096, 3072][1, 4096]cuda:0" = torch.ops.aten.permute.default(arg8_1, [1, 0]); arg8_1 = None + mm_3: "bf16[s72, 3072][3072, 1]cuda:0" = torch.ops.aten.mm.default(mul_57, permute_3); mul_57 = permute_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/models/llama.py:241 in forward, code: q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + split_with_sizes = torch.ops.aten.split_with_sizes.default(mm_3, [2048, 512, 512], -1); mm_3 = None + getitem: "bf16[s72, 2048][3072, 1]cuda:0" = split_with_sizes[0] + getitem_1: "bf16[s72, 512][3072, 1]cuda:0" = split_with_sizes[1] + getitem_2: "bf16[s72, 512][3072, 1]cuda:0" = split_with_sizes[2]; split_with_sizes = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:123 in forward_static, code: cos_sin = cos_sin_cache.index_select(0, positions) + index: "bf16[s72, 128][128, 1]cuda:0" = torch.ops.aten.index.Tensor(arg10_1, [arg9_1]); arg10_1 = arg9_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:124 in forward_static, code: cos, sin = cos_sin.chunk(2, dim=-1) + split = torch.ops.aten.split.Tensor(index, 64, -1); index = None + getitem_3: "bf16[s72, 64][128, 1]cuda:0" = split[0] + getitem_4: "bf16[s72, 64][128, 1]cuda:0" = split[1]; split = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:127 in forward_static, code: query = query.view(num_tokens, -1, head_size) + view_1: "bf16[s72, 16, 128][3072, 128, 1]cuda:0" = torch.ops.aten.reshape.default(getitem, [arg1_1, -1, 128]); getitem = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + split_1 = torch.ops.aten.split.Tensor(view_1, 64, -1); view_1 = None + getitem_5: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = split_1[0] + getitem_6: "bf16[s72, 16, 64][3072, 128, 1]cuda:0" = split_1[1]; split_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/base.py:141 in forward_static, code: key = key.view(num_tokens, -1, head_size) + view_3: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = torch.ops.aten.reshape.default(getitem_1, [arg1_1, -1, 128]); getitem_1 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:164 in forward_static, code: x1, x2 = torch.chunk(x, 2, dim=-1) + split_2 = torch.ops.aten.split.Tensor(view_3, 64, -1); view_3 = None + getitem_7: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = split_2[0] + getitem_8: "bf16[s72, 4, 64][3072, 128, 1]cuda:0" = split_2[1]; split_2 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = torch.ops.aten.unsqueeze.default(getitem_3, -2) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_89: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_5, unsqueeze) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in 
forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_1: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = torch.ops.aten.unsqueeze.default(getitem_4, -2) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_92: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_6, unsqueeze_1) + sub_46: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.sub.Tensor(mul_89, mul_92); mul_89 = mul_92 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_97: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_6, unsqueeze); getitem_6 = unsqueeze = None + mul_100: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_5, unsqueeze_1); getitem_5 = unsqueeze_1 = None + add_161: "bf16[s72, 16, 64][1024, 64, 1]cuda:0" = torch.ops.aten.add.Tensor(mul_97, mul_100); mul_97 = mul_100 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.ops.aten.cat.default([sub_46, add_161], -1); sub_46 = add_161 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:160 in forward_static, code: cos = cos.unsqueeze(-2).to(x.dtype) + unsqueeze_2: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = torch.ops.aten.unsqueeze.default(getitem_3, -2); getitem_3 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_128: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_7, unsqueeze_2) + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:161 in forward_static, code: sin = sin.unsqueeze(-2).to(x.dtype) + unsqueeze_3: "bf16[s72, 1, 64][128, 64, 1]cuda:0" = torch.ops.aten.unsqueeze.default(getitem_4, -2); getitem_4 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:169 in forward_static, code: o1 = x1 * cos - x2 * sin + mul_131: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_8, unsqueeze_3) + sub_63: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.sub.Tensor(mul_128, mul_131); mul_128 = mul_131 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:170 in forward_static, code: o2 = x2 * cos + x1 * sin + mul_136: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_8, unsqueeze_2); getitem_8 = unsqueeze_2 = None + mul_139: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.mul.Tensor(getitem_7, unsqueeze_3); getitem_7 = unsqueeze_3 = None + add_223: "bf16[s72, 4, 64][256, 64, 1]cuda:0" = torch.ops.aten.add.Tensor(mul_136, mul_139); mul_136 = mul_139 = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/rotary_embedding/common.py:173 in forward_static, code: output = torch.cat((o1, o2), dim=-1) + cat_1: "bf16[s72, 4, 128][512, 128, 1]cuda:0" = torch.ops.aten.cat.default([sub_63, add_223], -1); sub_63 = add_223 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:336 in forward, code: value = value.view(-1, self.num_kv_heads, self.head_size) + view_8: "bf16[s72, 4, 128][3072, 128, 1]cuda:0" = torch.ops.aten.reshape.default(getitem_2, [-1, 
4, 128]); getitem_2 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:326 in forward, code: output = torch.empty(output_shape, dtype=output_dtype, device=query.device) + empty: "bf16[s72, 2048][2048, 1]cuda:0" = torch.ops.aten.empty.memory_format([arg1_1, 2048], dtype = torch.bfloat16, device = device(type='cuda', index=0), pin_memory = False); arg1_1 = None + + # File: /data/users/angelayi/vllm/vllm/attention/layer.py:332 in forward, code: output = output.view(-1, self.num_heads, self.head_size) + view_6: "bf16[s72, 16, 128][2048, 128, 1]cuda:0" = torch.ops.aten.reshape.default(empty, [-1, 16, 128]); empty = None + + # File: /data/users/angelayi/vllm/vllm/model_executor/layers/layernorm.py:148 in forward_static, code: residual = x.to(orig_dtype) + convert_element_type_12: "bf16[s72, 4096][4096, 1]cuda:0" = torch.ops.prims.convert_element_type.default(add_65, torch.bfloat16); add_65 = None + return (cat, cat_1, view_8, view_6, convert_element_type_12) + +V0127 17:17:56.779000 1175001 /data/users/angelayi/vllm/vllm/compilation/piecewise_backend.py:142] {"artifact": {"name": "vllm_piecewise_compile_start", "encoding": "json"}, "rank": 0, "stack": [{"line": 1, "name": "", "filename": 1, "loc": ""}, {"line": 122, "name": "spawn_main", "filename": 2, "loc": "exitcode = _main(fd, parent_sentinel)"}, {"line": 135, "name": "_main", "filename": 2, "loc": "return self._bootstrap(parent_sentinel)"}, {"line": 314, "name": "_bootstrap", "filename": 3, "loc": "self.run()"}, {"line": 108, "name": "run", "filename": 3, "loc": "self._target(*self._args, **self._kwargs)"}, {"line": 742, "name": "worker_main", "filename": 4, "loc": "worker.worker_busy_loop(cancel=shutdown_event)"}, {"line": 819, "name": "worker_busy_loop", "filename": 4, "loc": "output = func(*args, **kwargs)"}, {"line": 459, "name": "compile_or_warm_up_model", "filename": 6, "loc": "cuda_graph_memory_bytes = self.model_runner.capture_model()"}, {"line": 4582, "name": "capture_model", "filename": 7, "loc": "self._capture_cudagraphs("}, {"line": 4683, "name": "_capture_cudagraphs", "filename": 7, "loc": "self._dummy_run("}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 4217, "name": "_dummy_run", "filename": 7, "loc": "outputs = self.model("}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 623, "name": "forward", "filename": 10, "loc": "model_output = self.model("}, {"line": 439, "name": "__call__", "filename": 11, "loc": "return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)"}, {"line": 223, "name": "__call__", "filename": 12, "loc": "return self._call_with_optional_nvtx_range("}, {"line": 109, "name": "_call_with_optional_nvtx_range", "filename": 12, "loc": "return callable_fn(*args, **kwargs)"}, {"line": 412, "name": "forward", "filename": 10, "loc": "def forward("}, {"line": 1181, "name": "_fn", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 54, "name": "__call__", "filename": 44, "loc": "return self.optimized_call(*args, **kwargs)"}, {"line": 936, "name": "call_wrapped", "filename": 45, "loc": "return self._wrapped_call(self, *args, **kwargs)"}, {"line": 442, "name": "__call__", "filename": 45, "loc": "return super(self.cls, obj).__call__(*args, **kwargs) # 
type: ignore[misc]"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 202, "name": "forward", "filename": 46, "loc": "submod_0 = self.submod_0(l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); l_input_ids_ = l_self_modules_embed_tokens_parameters_weight_ = l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_ = None"}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 222, "name": "__call__", "filename": 47, "loc": "self._maybe_compile_for_range_entry(range_entry, args)"}, {"line": 178, "name": "_maybe_compile_for_range_entry", "filename": 47, "loc": "self._log_compile_start(range_entry.compile_range)"}, {"line": 142, "name": "_log_compile_start", "filename": 47, "loc": "trace_structured("}], "has_payload": "17fa2874645a622d8f44fb15fe28352f"} + {"piecewise_index": 0, "submod_name": "submod_0", "total_piecewise_compiles": 33, "compile_range_start": 8, "compile_range_end": 8, "is_single_size": true, "is_cudagraph_capture_size": true} +V0127 17:17:56.874000 1175001 /data/users/angelayi/vllm/vllm/compilation/piecewise_backend.py:142] {"artifact": {"name": "vllm_piecewise_compile_start", "encoding": "json"}, "rank": 0, "stack": [{"line": 1, "name": "", "filename": 1, "loc": ""}, {"line": 122, "name": "spawn_main", "filename": 2, "loc": "exitcode = _main(fd, parent_sentinel)"}, {"line": 135, "name": "_main", "filename": 2, "loc": "return self._bootstrap(parent_sentinel)"}, {"line": 314, "name": "_bootstrap", "filename": 3, "loc": "self.run()"}, {"line": 108, "name": "run", "filename": 3, "loc": "self._target(*self._args, **self._kwargs)"}, {"line": 742, "name": "worker_main", "filename": 4, "loc": "worker.worker_busy_loop(cancel=shutdown_event)"}, {"line": 819, "name": "worker_busy_loop", "filename": 4, "loc": "output = func(*args, **kwargs)"}, {"line": 459, "name": "compile_or_warm_up_model", "filename": 6, "loc": "cuda_graph_memory_bytes = self.model_runner.capture_model()"}, {"line": 4582, "name": "capture_model", "filename": 7, "loc": "self._capture_cudagraphs("}, {"line": 4683, "name": "_capture_cudagraphs", "filename": 7, "loc": "self._dummy_run("}, {"line": 124, "name": "decorate_context", "filename": 5, "loc": "return func(*args, **kwargs)"}, {"line": 4217, "name": "_dummy_run", "filename": 7, "loc": "outputs = self.model("}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 623, "name": "forward", "filename": 10, "loc": "model_output = self.model("}, {"line": 439, "name": "__call__", "filename": 11, "loc": "return TorchCompileWithNoGuardsWrapper.__call__(self, *args, **kwargs)"}, {"line": 223, "name": "__call__", "filename": 12, "loc": "return self._call_with_optional_nvtx_range("}, {"line": 109, "name": 
"_call_with_optional_nvtx_range", "filename": 12, "loc": "return callable_fn(*args, **kwargs)"}, {"line": 412, "name": "forward", "filename": 10, "loc": "def forward("}, {"line": 1181, "name": "_fn", "filename": 16, "loc": "return fn(*args, **kwargs)"}, {"line": 54, "name": "__call__", "filename": 44, "loc": "return self.optimized_call(*args, **kwargs)"}, {"line": 936, "name": "call_wrapped", "filename": 45, "loc": "return self._wrapped_call(self, *args, **kwargs)"}, {"line": 442, "name": "__call__", "filename": 45, "loc": "return super(self.cls, obj).__call__(*args, **kwargs) # type: ignore[misc]"}, {"line": 1776, "name": "_wrapped_call_impl", "filename": 9, "loc": "return self._call_impl(*args, **kwargs)"}, {"line": 1787, "name": "_call_impl", "filename": 9, "loc": "return forward_call(*args, **kwargs)"}, {"line": 209, "name": "forward", "filename": 46, "loc": "submod_2 = self.submod_2(getitem_3, s72, l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_, getitem_4, l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_); getitem_3 = l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_ = getitem_4 = l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_ = l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_ = l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_ = l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_ = None"}, {"line": 220, "name": "__call__", "filename": 8, "loc": "return self.runnable(*args, **kwargs)"}, {"line": 222, "name": "__call__", "filename": 47, "loc": "self._maybe_compile_for_range_entry(range_entry, args)"}, {"line": 178, "name": "_maybe_compile_for_range_entry", "filename": 47, "loc": "self._log_compile_start(range_entry.compile_range)"}, {"line": 142, "name": "_log_compile_start", "filename": 47, "loc": "trace_structured("}], "has_payload": "6be2b0589586022698956f5b2af9e2fb"} + {"piecewise_index": 1, "submod_name": "submod_2", "total_piecewise_compiles": 33, "compile_range_start": 8, "compile_range_end": 8, "is_single_size": true, "is_cudagraph_capture_size": true} diff --git a/tests/integration_test.rs b/tests/integration_test.rs index aca9427..27d9137 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -2631,3 +2631,42 @@ fn test_graph_execution_order_diagnostics() -> Result<(), Box = output.unwrap().into_iter().collect(); + + // check for vLLM-specific artifacts + let expected_files = [ + "-_0_0_0/vllm_compilation_config", + "-_0_0_0/vllm_piecewise_split_graph", + "-_-_-_-/vllm_submod_0", + "-_-_-_-/vllm_submod_2", + ]; + + for prefix in expected_files { + assert!( + prefix_exists(&map, prefix), + "{} not found in output", + prefix + ); + } + + let index_html = &map[&PathBuf::from("index.html")]; + + assert!(index_html.contains("vLLM Compilation Summary"),); + assert!(index_html.contains("Dynamo Compilation"),); + 
assert!(index_html.contains("Compilation Configuration"),); + assert!(index_html.contains("Piecewise Split Graph"),); + assert!(index_html.contains("range [1, 16384]"),); + assert!(index_html.contains("size 8"),); + assert!(index_html.contains("submod_0"),); + assert!(index_html.contains("submod_2"),); +}