Skip to content

Commit

Permalink
k
Browse files: browse the repository at this point in the history
  • Loading branch information
yuhongsun96 committed Dec 1, 2024
1 parent f12eb4a commit 8f49a6d
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,23 +67,25 @@ def process_token(
if piece_that_comes_after == "\n" and in_code_block(self.llm_out):
self.curr_segment = self.curr_segment.replace("```", "```plaintext")

citation_pattern = r"\[(\d+)\]"
citation_pattern = r"\[(\d+)\]|\[\[(\d+)\]\]"
citations_found = list(re.finditer(citation_pattern, self.curr_segment))
possible_citation_pattern = r"(\[\d*$)" # [1, [, etc
possible_citation_pattern = r"(\[+\d*$)"
possible_citation_found = re.search(
possible_citation_pattern, self.curr_segment
)

if len(citations_found) == 0 and len(self.llm_out) - self.past_cite_count > 5:
self.current_citations = []

result = "" # Initialize result here
result = ""
if citations_found and not in_code_block(self.llm_out):
last_citation_end = 0
length_to_add = 0
while len(citations_found) > 0:
citation = citations_found.pop(0)
numerical_value = int(citation.group(1))
numerical_value = int(
next(group for group in citation.groups() if group is not None)
)

if 1 <= numerical_value <= self.max_citation_num:
context_llm_doc = self.context_docs[numerical_value - 1]
Expand Down Expand Up @@ -131,14 +133,6 @@ def process_token(

link = context_llm_doc.link

# Replace the citation in the current segment
start, end = citation.span()
self.curr_segment = (
self.curr_segment[: start + length_to_add]
+ f"[{target_citation_num}]"
+ self.curr_segment[end + length_to_add :]
)

self.past_cite_count = len(self.llm_out)
self.current_citations.append(target_citation_num)

Expand All @@ -149,6 +143,7 @@ def process_token(
document_id=context_llm_doc.document_id,
)

start, end = citation.span()
if link:
prev_length = len(self.curr_segment)
self.curr_segment = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,16 @@ def process_text(
"Here is some text[[1]](https://0.com). Some other text",
["doc_0"],
),
# ['To', ' set', ' up', ' D', 'answer', ',', ' if', ' you', ' are', ' running', ' it', ' yourself', ' and',
# ' need', ' access', ' to', ' certain', ' features', ' like', ' auto', '-sync', 'ing', ' document',
# '-level', ' access', ' permissions', ',', ' you', ' should', ' reach', ' out', ' to', ' the', ' D',
# 'answer', ' team', ' to', ' receive', ' access', ' [[', '4', ']].', '']
(
"Unique tokens with double brackets and a single token that ends the citation and has characters after it.",
["... to receive access", " [[", "1", "]].", ""],
"... to receive access [[1]](https://0.com).",
["doc_0"],
),
],
)
def test_citation_extraction(
Expand Down

0 comments on commit 8f49a6d

Please sign in to comment.