Skip to content

Commit

Permalink
Additional chunks based on references and definitions in code and pro…
Browse files Browse the repository at this point in the history
…c tools (#1180)

* saving work

symbol tool

add metadata to code tool chunks

showing metadata

add github to repo

filter empty symbols

fixing some bugs, dedup

fmt

adding aliases

add code to symbol tool

new prompt

add chunk to exchanges

adding symbol ids

save work

now working

adding filter

save work

saving work

remove function call

get ref def outside of exchange

impl agent

refdef to chunks

clean chunk generation

filter_chunks

filter_chunks in new file

fix repo_ref

add to proc

delete symbol tool

warnings

adding constants

relative_path field revert to text

user query simplified

repo_ref to accept github and local

remove filter action, make a function for llm call

better naming and adding comments

fmt

adding some error handling

* fix error handling

* clippy

* fmt

* wip: refactor changes

* using snippet from file_symbol

* adding repo_ref, reverting empty filter, adding filter comments

* tokeninfo response not pub anymore

* simplifying how we update path in a chunk

* symbol alias instead of chunk alias

* moving prompt to prompts.rs

* reverting filter

* fix unwrap in get_token_info

* adding some comments

* typo

* better doc

* make start_byte/end_byte usize and move function call def into prompts.rs

* log selected symbol

* unwrap_or_default

* revert Cargo.lock

* latency

* use all hoverable symbols, tokeninfo only for the selected one

* no function_calling

* filter out local symbols

* removing unused functions

* clippy filter_map

* prompt improved

* fix explanation and add a const

* reformat prompt and inline llm call

* remove dbg! calls

---------

Co-authored-by: Gabriel Gordon-Hall <ggordonhall@gmail.com>
Co-authored-by: Akshay <nerdy@peppe.rs>
  • Loading branch information
3 people authored Jan 8, 2024
1 parent e6325f8 commit 1998fff
Show file tree
Hide file tree
Showing 12 changed files with 462 additions and 59 deletions.
1 change: 1 addition & 0 deletions server/bleep/src/agent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const MAX_STEPS: usize = 10;
pub mod exchange;
pub mod model;
pub mod prompts;
pub mod symbol;
pub mod transcoder;

/// A collection of modules that each add methods to `Agent`.
Expand Down
4 changes: 2 additions & 2 deletions server/bleep/src/agent/exchange.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,14 @@ impl SearchStep {
/// A snippet of code taken from a file, together with its location.
#[derive(Clone, Debug, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct CodeChunk {
/// Path of the file the snippet comes from.
pub path: String,
/// Numeric alias associated with `path`.
#[serde(rename = "alias")]
pub alias: usize,
/// The text of the chunk.
#[serde(rename = "snippet")]
pub snippet: String,
/// First line of the chunk; serialized as `start`.
#[serde(rename = "start")]
pub start_line: usize,
/// Last line of the chunk; serialized as `end`.
#[serde(rename = "end")]
pub end_line: usize,
/// Byte offset at which the chunk starts, when known.
// NOTE(review): presumably an offset into the whole file's contents — confirm against producers.
pub start_byte: Option<usize>,
/// Byte offset at which the chunk ends, when known.
pub end_byte: Option<usize>,
}

impl CodeChunk {
Expand Down
19 changes: 18 additions & 1 deletion server/bleep/src/agent/prompts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ pub fn system<'a>(paths: impl IntoIterator<Item = &'a str>) -> String {
- DO NOT call a function that you've used before with the same arguments
- DO NOT assume the structure of the codebase, or the existence of files or folders
- Your queries to functions.code or functions.path should be significantly different to previous queries
- Call functions.none with paths that you are confident will help answer the user's query
- Call functions.none with paths that you are confident will help answer the user's query, include paths containing the information needed for a complete answer including definitions and references
- If the user query is general (e.g. 'What does this do?', 'What is this repo?') look for READMEs, documentation and entry points in the code (main files, index files, api files etc.)
- If the user is referring to, or asking for, information that is in your history, call functions.none
- If after attempting to gather information you are still unsure how to answer the query, call functions.none
Expand Down Expand Up @@ -386,6 +386,23 @@ Here is the full context for reference:
)
}

/// Builds the system prompt for the symbol classifier.
///
/// `snippets` (the rendered code chunks with their enumerated symbols) is placed
/// first, followed by fixed instructions asking the model to answer with the
/// index of the most relevant symbol.
pub fn symbol_classification_prompt(snippets: &str) -> String {
    // Fixed instruction text appended after the snippets. The leading newline
    // separates it from the last snippet line.
    const INSTRUCTIONS: &str = r#"
Above are code chunks and non-local symbols that have been extracted from the chunks. Each chunk is followed by an enumerated list of symbols that it contains. Given a user query, select the symbol which is most relevant to it, e.g. the references or definition of this symbol would help somebody answer the query. Symbols which are language builtins or which come from third party libraries are unlikely to be helpful.
Do not answer with the symbol name, use the symbol index.
### Examples ###
Q: how does ranking work?
23
Q: which function makes an api call
3"#;

    let mut prompt = String::with_capacity(snippets.len() + INSTRUCTIONS.len());
    prompt.push_str(snippets);
    prompt.push_str(INSTRUCTIONS);
    prompt
}

pub fn hypothetical_document_prompt(query: &str) -> String {
format!(
r#"Write a code snippet that could hypothetically be returned by a code search engine as the answer to the query: {query}
Expand Down
323 changes: 323 additions & 0 deletions server/bleep/src/agent/symbol.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
use crate::agent::{exchange::CodeChunk, Agent};
use crate::intelligence::{code_navigation::FileSymbols, Language, TSLanguage};
use crate::llm_gateway;
use crate::webserver::intelligence::{get_token_info, TokenInfoRequest};
use anyhow::{Context, Result};
use tracing::log::{debug, info, warn};

use super::prompts::symbol_classification_prompt;

/// A code chunk paired with the hoverable symbols that were extracted from it.
pub struct ChunkWithHoverableSymbols {
/// The chunk the symbols were extracted from.
pub chunk: CodeChunk,
/// Symbols found inside `chunk`.
pub symbols: Vec<HoverableSymbol>,
}

/// These methods help the code and proc tools return related chunks based on references and definitions.
/// `get_related_chunks` receives a list of chunks from code or proc search and returns up to `MAX_CHUNKS` related chunks.
/// For each input chunk, we extract all symbols (variables, function names, structs...).
/// Then we discard symbols that are defined in the same file, using the scope graph.
/// We then pick ONE symbol using a classifier (`filter_symbols`), where the classifier has access to the user query, the original chunks and the filtered list of symbols.
/// This selected symbol may be present in many files one or more times.
/// We extract the surrounding code (up to `NUMBER_CHUNK_LINES` lines) for each occurrence and pick `MAX_CHUNKS` occurrences/chunks.
impl Agent {
/// Collects the hoverable symbols that start inside `chunk`.
///
/// The file at `chunk.path` is looked up in the file index. Hoverable ranges are
/// kept when their start byte lies in `[chunk.start_byte, chunk.end_byte)`
/// (a missing offset is treated as 0) and when the scope graph does not classify
/// them as a reference or a definition. Symbol names are sliced out of
/// `chunk.snippet`; a range that cannot be sliced safely (it runs past the
/// snippet or does not fall on a character boundary) is skipped instead of
/// panicking. The result is sorted and de-duplicated by name.
///
/// # Errors
///
/// Returns an error when the file lookup fails or finds nothing, when the file
/// has no scope graph, or when it has no hoverable ranges.
pub async fn extract_hoverable_symbols(
    &self,
    chunk: CodeChunk,
) -> Result<ChunkWithHoverableSymbols> {
    // get hoverable elements
    let document = self
        .app
        .indexes
        .file
        .by_path(&self.repo_ref, &chunk.path, None)
        .await?
        .with_context(|| format!("failed to read path: {}", &chunk.path))?;

    let graph = document
        .symbol_locations
        .scope_graph()
        .with_context(|| format!("no scope graph for file: {}", &chunk.path))?;

    let hoverable_ranges = document
        .hoverable_ranges()
        .ok_or_else(|| anyhow::anyhow!("no hoverable ranges"))?;

    // chunks without byte offsets get an empty [0, 0) window, i.e. no symbols
    let chunk_start = chunk.start_byte.unwrap_or_default();
    let chunk_end = chunk.end_byte.unwrap_or_default();

    let mut symbols = hoverable_ranges
        .into_iter()
        .filter(|range| range.start.byte >= chunk_start && range.start.byte < chunk_end)
        .filter(|range| {
            // if this node can be resolved locally in the scope-graph, omit it
            match graph.node_by_range(range.start.byte, range.end.byte) {
                Some(node) => !(graph.is_reference(node) || graph.is_definition(node)),
                None => true,
            }
        })
        .filter_map(|range| {
            // Translate file offsets into snippet offsets. Only the start of the
            // range is known to lie inside the chunk, so use checked slicing.
            let start = range.start.byte.checked_sub(chunk_start)?;
            let end = range.end.byte.checked_sub(chunk_start)?;
            let name = chunk.snippet.get(start..end)?.to_string();

            Some(HoverableSymbol {
                name,
                token_info_request: TokenInfoRequest {
                    relative_path: chunk.path.clone(),
                    repo_ref: self.repo_ref.display_name(),
                    branch: None,
                    start: range.start.byte,
                    end: range.end.byte,
                },
                path: chunk.path.clone(),
            })
        })
        .collect::<Vec<_>>();

    // keep a single entry per symbol name
    symbols.sort_by(|a, b| a.name.cmp(&b.name));
    symbols.dedup_by(|a, b| a.name == b.name);

    debug!(
        "Attached {} symbols: {:?}",
        symbols.len(),
        symbols.iter().map(|s| s.name.as_str()).collect::<Vec<_>>()
    );

    Ok(ChunkWithHoverableSymbols { chunk, symbols })
}

/// Turns every occurrence recorded in `symbol.related_symbols` into a `CodeChunk`.
///
/// A symbol may appear in several files and several times per file. Each
/// occurrence yields one chunk carrying the file name, the occurrence's snippet
/// text and its line range. The alias is set to 0 and the byte offsets to `None`.
pub async fn expand_symbol_into_chunks(&self, symbol: Symbol) -> Vec<CodeChunk> {
    let mut chunks = Vec::new();

    for file_symbols in &symbol.related_symbols {
        for occurrence in &file_symbols.data {
            let snippet = &occurrence.snippet;
            chunks.push(CodeChunk {
                path: file_symbols.file.clone(),
                alias: 0,
                snippet: snippet.data.clone(),
                start_line: snippet.line_range.start,
                end_line: snippet.line_range.end,
                start_byte: None,
                end_byte: None,
            });
        }
    }

    chunks
}

pub async fn filter_symbols(
&self,
query: &str,
chunks_with_symbols: Vec<ChunkWithHoverableSymbols>,
) -> Result<Symbol, SymbolError> {
if chunks_with_symbols.is_empty() {
return Err(SymbolError::ListEmpty);
}

const NUMBER_CHUNK_LINES: usize = 10;

// we have multiples chunks and each chunk may have multiple symbols
// unique alias (i) per symbol
let mut i: i32 = -1;
let symbols = chunks_with_symbols
.into_iter()
.map(|chunk_with_symbol| {
(
chunk_with_symbol.chunk,
chunk_with_symbol
.symbols
.into_iter()
.map(|symbol| {
i += 1;
(i, symbol)
})
.collect::<Vec<_>>(),
)
})
.collect::<Vec<_>>();

// Classifier

// context
let chunks_string = symbols
.iter()
.filter(|(_, s)| !s.is_empty())
.map(|(c, s)| {
let symbols_string = s
.iter()
.map(|(i, refdef)| format!("{}: {}", i, refdef.name))
.collect::<Vec<_>>()
.join("\n");

format!(
"```{}\n{}```\n\n{}",
c.path.clone(),
c.snippet.clone(),
symbols_string
)
})
.collect::<Vec<_>>()
.join("\n\n");

// instruction
let messages = vec![
llm_gateway::api::Message::system(&symbol_classification_prompt(&chunks_string)),
llm_gateway::api::Message::user(query),
];

let response = match self
.llm_gateway
.clone()
.model("gpt-4-0613")
.temperature(0.0)
.chat(&messages, None)
.await
{
Ok(response) => response,
Err(e) => {
warn!(
"Symbol classifier llm call failed, picking the first symbol: {}",
e
);
"0".into()
}
};

let selected_symbol = match response.as_str().parse::<i32>() {
Ok(symbol) => symbol,
Err(e) => {
warn!("Parsing to integer failed, picking the first symbol: {}", e);
0
}
};

// finding symbol metadata
match symbols
.into_iter()
.flat_map(|(_, symbol_with_alias)| symbol_with_alias)
.find(|(alias, _)| *alias == selected_symbol)
{
Some((_alias, symbol_metadata)) => Ok(Symbol {
name: symbol_metadata.name,
related_symbols: {
let document = self
.app
.indexes
.file
.by_path(&self.repo_ref, &symbol_metadata.path, None)
.await
.unwrap()
.unwrap();

let all_docs = {
let associated_langs =
match document.lang.as_deref().map(TSLanguage::from_id) {
Some(Language::Supported(config)) => config.language_ids,
_ => &[],
};
self.app
.indexes
.file
.by_repo(&self.repo_ref, associated_langs.iter(), None)
.await
};

get_token_info(
symbol_metadata.token_info_request,
&self.repo_ref,
self.app.indexes.clone(),
&document,
&all_docs,
Some(0),
Some(NUMBER_CHUNK_LINES),
)
.await
.unwrap()
.into_iter()
.filter(|file_symbol| file_symbol.file != symbol_metadata.path)
.collect::<Vec<_>>()
},
}),
_ => Err(SymbolError::OutOfBounds),
}
}

pub async fn get_related_chunks(&mut self, chunks: Vec<CodeChunk>) -> Vec<CodeChunk> {
const MAX_CHUNKS: usize = 3;

// get symbols with ref/defs for each chunk
let chunks_with_symbols = futures::future::join_all(
chunks
.iter()
.filter(|c| !c.is_empty())
.map(|c| self.extract_hoverable_symbols(c.clone())), // TODO: Log failure
)
.await
.into_iter()
.filter_map(Result::ok)
.collect();

// get original user query
let user_query = self.last_exchange().query.target().unwrap();

// select one symbol
let selected_symbol = match self.filter_symbols(&user_query, chunks_with_symbols).await {
Ok(selected_symbol) => {
info!("Selected symbol: {}", selected_symbol.name);
selected_symbol
}
Err(e) => {
info!("Returning no extra chunks: {}", e);
return Vec::new();
}
};

// take 3 chunks, update path aliases, update enchange chunks
let extra_chunks = self
.expand_symbol_into_chunks(selected_symbol)
.await
.iter()
.take(MAX_CHUNKS)
.map(|c| {
let chunk = CodeChunk {
alias: self.get_path_alias(c.path.as_str()),
..c.clone()
};
self.exchanges
.last_mut()
.unwrap()
.code_chunks
.push(chunk.clone());
chunk
})
.collect::<Vec<_>>();

extra_chunks
}
}

/// A symbol extracted from a code chunk, with what is needed to look up its references and definitions.
pub struct HoverableSymbol {
/// Text of the symbol, sliced out of the chunk's snippet.
pub name: String,
/// Request passed to `get_token_info` to resolve this symbol.
pub token_info_request: TokenInfoRequest,
/// Path of the file the symbol was found in.
pub path: String,
}
/// The symbol selected by the classifier, together with its occurrences in other files.
pub struct Symbol {
/// Text of the selected symbol.
pub name: String,
/// Per-file results returned by `get_token_info`, excluding the symbol's own file.
pub related_symbols: Vec<FileSymbols>,
}

/// Errors returned by `filter_symbols` when no symbol can be selected.
#[derive(thiserror::Error, Debug)]
pub enum SymbolError {
/// There was nothing to select a symbol from.
#[error("No symbol retrieved in the provided chunks")]
ListEmpty,
/// The selected index does not correspond to any known symbol.
#[error("Selected symbol out of bounds")]
OutOfBounds,
}
4 changes: 4 additions & 0 deletions server/bleep/src/agent/tools/answer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,8 @@ impl Agent {
snippet,
start_line: span.start,
end_line: span.end,
start_byte: None,
end_byte: None,
}
})
.collect::<Vec<CodeChunk>>();
Expand All @@ -383,6 +385,8 @@ impl Agent {
snippet: trimmed_snippet.to_string(),
start_line: chunk.start_line,
end_line: (chunk.start_line + num_trimmed_lines).saturating_sub(1),
start_byte: chunk.start_byte,
end_byte: chunk.end_byte,
}]
} else {
code_chunks
Expand Down
Loading

0 comments on commit 1998fff

Please sign in to comment.