rag_pipeline_llm.py

# -*- coding: utf-8 -*-
"""RAG-pipeline-LLM.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1D-K_4_n3CzIiCU-PVi291Yf5ZV7eKjWq
"""

# Implementing RAG pipeline and setting up a local LLM
# This can also be used with LLM API


import torch

var = torch.rand(10,20)
var

# Importing / Loading the PDF / Source

import os
import requests


# get the pdf document path

pdf_path = "Human-Nutrition-2020-Edition-1598491699.pdf"

# check
print(os.path.exists(pdf_path))

# download PDF

if not os.path.exists(pdf_path):
    print("doesn't exist...... downloding")
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    #the local filename to save the downloaded file
    filename = pdf_path

    #sending a get request to the URL
    response = requests.get(url)


    # check if the request is succesfull
    if response.status_code == 200:
        # open the file and save it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"file has been downloaded and saved as {filename}")

    else:
        print(f"{response.status_code}")

else:
    print(f"file {pdf_path} exists")

"""Opening the PDF and creating data structure for our source page

{
  page_Number,
  page_cahr_count,
  page_word_count,
  sentence_count,
  token_count
}
"""

pip install pyMuPdf

import fitz # pip install pyMuPdf
from tqdm.auto import tqdm

"""Small helper / Formatter function to process the PDF"""

# small helper / text formatting function to process the pdf

def text_formatter(text: str) -> str:
    """ performs minor formatting on text. """
    cleaned_text = text.replace("\n", " ").strip()


    # more text formatting can go here


    return cleaned_text

# pages_and_texts is very important
# see carefully how its getting generated

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    # empty list
    pages_and_texts = []

    for page_number, page in tqdm(enumerate(doc)):
        # getting the text from pdf
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number - 41,
        "page_char_count": len(text),
        "page_word_count": len(text.split(" ")),
        "page_sentence_count": len(text.split(". ")),
        "page_token_count": len(text)/4, # 1 token ~ 4 characters
        "text":text})


    return pages_and_texts

"""Passing the PDF to process"""

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

pages_and_texts[:10]

import pandas as pd

data = pd.DataFrame(pages_and_texts)

data.head()

data['page_char_count'].max()

data['page_sentence_count'].max()

data['page_token_count'].max()

"""Turning Text into Chunks

Two ways we can do it


*   Using split "."
*   Using NLP library such as spacy / nltk


We can use **sentencizer** pipeline from spacy.
"""

from spacy.lang.en import English

# creating an instance of English

nlp = English()

# creating a senctencizer pipeline

nlp.add_pipe("sentencizer")

# creating a simple document / sample passage to see how it works

source = nlp("hi this is a simple passage to see how it works. hope it works good. and the development will yield the results whuch we are expecting ! thanks.")

list(source.sents)

"""Lets see how text looks without **pipeline**:"""

for values in pages_and_texts:
  print(values['text'])

"""Lets see how it looks with **Sentencizer pipeline**"""

for items in pages_and_texts:
  print(list(nlp(items['text']).sents))

for items in tqdm(pages_and_texts):
    items["sentences"] = list(nlp(items['text']).sents)

    # defualt data type is SpaCy, we want to convert into string

    items['sentences'] = [str(sentences) for sentences in items['sentences']]

    # count of sentences

    items["page_sentences_count_spacy"] = len(items['sentences'])

items

import random
random.sample(pages_and_texts, k=10)

data = pd.DataFrame(pages_and_texts)
data.head(10)

"""**Chunks** 🇰

Concept of splitting large texts / passages into smaller ones.

Already we have identified the sentences.

**Lets split 10 sentences as 1 chunk.** This is completely ustomisable

This helps easier to filter.

So our text can fit into the embedding model

also our context passed into LLM can be more specific.

"""

num_sentence_chunk_size = 10

# create a function to split list of texts recursively to chunk size

# example [20] -> [10,10] or [28] -> [10,10,8]

def split_list(input_list: list[str],
              slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

test_split = split_list(input_list=items['sentences'], slice_size=num_sentence_chunk_size)
test_split

# loop through pages and split sentences into chunks

for items in tqdm(pages_and_texts):
    items["sentence_chunks"] = split_list(input_list=items['sentences'],
                                          slice_size=num_sentence_chunk_size)
    items["num_chunks"] = len(items["sentence_chunks"])

torch.cuda.is_available()

torch.cuda.get_device_name(0)

data

"""**Splitting each chunks to its own item**"""

# regex

import re

# split each chunk into its own item

pages_and_chunks = []

for item in tqdm(pages_and_texts):
    for sentence_chunk in item['sentence_chunks']:
        chunk_dict = {}
        chunk_dict["page_number"] = item['page_number']


        #join the sentences together into a paragraph like structure. aka join the list of sentences into one paragraph.
        joined_sentence_chunk = "".join(sentence_chunk).replace(" ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # this line will convert ".A" to ". A" (will work for capital letters)

        #join sentence chunk
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # get some stats on our chunks
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4  # 1 token = ~4 Chars

        pages_and_chunks.append(chunk_dict)


len(pages_and_chunks)

random.sample(pages_and_chunks, k=1)

data_chunks = pd.DataFrame(pages_and_chunks)
data_chunks[data_chunks['chunk_token_count'] < 30]

for row in data_chunks[data_chunks['chunk_token_count'] <= 30].sample(100).iterrows():
    print(f'chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

pages_and_chunks_over_min_token = data_chunks[data_chunks['chunk_token_count'] > 30].to_dict(orient="records")

random.sample(pages_and_chunks_over_min_token, k=1)

"""**Embeddings**"""

from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

pip install sentence_transformers

for items in tqdm(pages_and_chunks_over_min_token):
    items["embedding"] = embedding_model.encode(items["sentence_chunk"])

text_chunks = [items['sentence_chunk'] for items in pages_and_chunks_over_min_token]

sample_emb = len(items["embedding"])
sample_emb

text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # batch size will never change the output. but it does make an impact in process timing
                                               convert_to_tensor = True)

text_chunk_embeddings

pages_and_chunks_over_min_token[108]

"""Saving embeddings to a **file**"""

text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token)

embeddings_df_save_path = "text_chunks_and_embeddings.csv"

text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

text_chunk_embeddings_load = pd.read_csv(embeddings_df_save_path)

text_chunk_embeddings_load.head(10)

import random

import numpy as np
import pandas as pd
import torch


# You can run this on a MacBook, however, you will need to change the device to “mps” (Metal Performance Shaders) rather than “cuda”.
# For example, `device = “mps” if torch.backends.mps.is_available() else “cpu”`.

device = "cuda"  if torch.backends.mps.is_available() else "cpu"

text_chunks_and_embeddings_df = pd.read_csv('text_chunks_and_embeddings.csv')


# converting the embedding coloumn back to np.array (it got converted to string when we load it into csv)
text_chunks_and_embeddings_df['embedding'] = text_chunks_and_embeddings_df['embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# converting embeddings into a torch.tensor
# as this one was missing
# embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df['embedding'].to_list(), axis=0)).to(device)


embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df['embedding'].to_list(), axis=0), dtype=torch.float32).to(device)

# convert texts and embedding df to a list of dicts

pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

text_chunks_and_embeddings_df['embedding']

embeddings_test = text_chunks_and_embeddings_df['embedding'].to_list()

embeddings = np.stack(text_chunks_and_embeddings_df['embedding'].to_list(), axis=0)
embeddings

embeddings_np = np.stack(text_chunks_and_embeddings_df['embedding'].to_list(), axis=0).astype(np.float32)
embeddings = torch.tensor(embeddings_np, device=device)

embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df['embedding'].to_list(), axis=0), dtype=torch.float32).to(device)

embeddings.shape

# create a model

from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                     device=device)


# this helps in case you mess up !

tensor_on_cpu = torch.tensor(np.stack(text_chunks_and_embeddings_df['embedding'].to_list(), axis=0), dtype=torch.float32)

tensor_on_gpu = tensor_on_cpu.to(device)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df['embedding'].to_list(), axis=0), dtype=torch.float32)

# 1
# define the query
query = "macronutrients functions"
# query = "Iodine Deficiency"
# query = 'good foods of protein'
print(f'Query: {query}')

# 2
# embed the query using model.encode()
# note: its important to embed the query into same model which we embed the passages
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3
# get similarity scores with the dot product (use cosine similarity if outputs of model isn't normalized)

# lets see how fast it is in the timer
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=tensor_on_gpu)[0]
end_time = timer()

print(f'[INFO] Time taken to get scores on {len(tensor_on_gpu)} | emeddings : {end_time-start_time:.5f} seconds')


# 4
# we only want top-k / top-5  results
# ref -> https://pytorch.org/docs/stable/generated/torch.topk.html
top_results_dot_product = torch.topk(dot_scores, k=5) # top-k means top 5 results
top_results_dot_product

pages_and_chunks[42]

# multiple with 1000

larger_embeddings = torch.randn(1000*embeddings.shape[0],768).to(device)

print(f'Embeddings shape : {embeddings.shape[0]}')

larger_embeddings_on_gpu = larger_embeddings.to(device)
query_embedding = query_embedding.to(device)

# perform dot product across 168000 embeddings

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=larger_embeddings_on_gpu)[0]
end_time = timer()

print(f'[INFO] Time taken to get scores on {len(larger_embeddings_on_gpu)} | emeddings : {end_time-start_time:.5f} seconds')

import textwrap

def text_wrap(text, wrap_length=80):
    wrapped_text = textwrap.fill(text,wrap_length)
    print(wrapped_text)

top_results_dot_product

# query = 'good foods of protein'
print(f'Query : {query}\n')
print("Results:")

# loop through the zipped scores of dot product from torch.topk
for values,indices in zip(top_results_dot_product[0],top_results_dot_product[1]):
    print(f'Scores : \n {values:.4f}')
    print('Text:')
    print(pages_and_chunks[indices]['sentence_chunk'])
    print(f'page_number: {pages_and_chunks[indices]["page_number"]}')

print('Done \n')

import fitz


# Open PDF and load the target

pdf_path = 'Human-Nutrition-2020-Edition-1598491699.pdf'
doc = fitz.open(pdf_path)
page = doc.load_page( 5 + 41) # note : page number of pdf starts from 41+ and page number is 5 or 411, change accordingly based on the result from torch.topk


# get the Image of the page

img = page.get_pixmap(dpi=400)


# save image

img.save('outputfile.png')

doc.close()


# convert pixmap to numpy array

img_array = np.frombuffer(img.samples_mv,
                          dtype = np.uint8).reshape((img.h, img.w, img.n))

img_array

# displaying the PDF using matplotlib

import matplotlib.pyplot as plt
plt.figure(figsize=(13,10))
plt.imshow(img_array)
plt.title(f'Query : {query} | Most relevant Passage')

# displaying the PDF using matplotlib

import matplotlib.pyplot as plt
plt.figure(figsize=(13,10))
plt.imshow(img_array)
plt.title(f'Query : {query} | Most relevant Passage')
plt.axis("off")

def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor, #could be a numpy array but numpy can't use GPU while pytorch can
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5, # this is going to be the number of passages, 5 paragraphs
                                print_time: bool=True):
    """
    Embeds a query with 'all-mpnet-base-v2' model and returns top k scores indices for embeddings
    """

    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Get dot product scores on embeddings
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f'[INFO] time taken to get scores on {len(embeddings)} embeddings in : {end_time - start_time: .5f} seconds.')

    scores, indices = torch.topk(input=dot_scores,
                                 k=n_resources_to_return)
    return scores, indices

retrieve_relevant_resources(query="foods high in fiber", embeddings=embeddings)

# function to view / read the passages

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict] = pages_and_chunks, # this is were we store our text and data
                                 n_resources_to_return: int=5):

    """
    Finds relevant passages given a query and prints along with the scores
    """

    scores, indices = retrieve_relevant_resources(query=query,
                                                    embeddings=embeddings,
                                                    n_resources_to_return=n_resources_to_return)

    # loop through the zipped scores of dot product from torch.topk
    for values,indices in zip(scores,indices):
        print(f'Scores : \n {values:.4f}')
        print('Text:')
        print(pages_and_chunks[indices]['sentence_chunk'])
        print(f'page_number: {pages_and_chunks[indices]["page_number"]}')

    print('Done \n')

query = "foods high in fiber"
retrieve_relevant_resources(query=query, embeddings=embeddings)
print_top_results_and_scores(query=query, embeddings=embeddings)


"""Getting LLM run locally"""

# checking GPU memory

import torch

if torch.cuda.is_available():
    gpu_memory_reserved_bytes = torch.cuda.memory_reserved(0)
    gpu_memory_gb = round(gpu_memory_reserved_bytes / (2**30))
    print(f'Available GPU memory: {gpu_memory_gb} GB')
else:
    print('GPU is not available.')

torch.cuda.is_available()

# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

from google.colab import userdata
test_token = userdata.get('HF')
print(test_token)

!nvidia-smi

"""Loading an LLM **locally**"""

# we need
# quantization config (on precision eg. 4bit, 8bit etc)
# model ID - this will tell transformers which model / tokenizer to load
# tokenixer -> turns text into numbers (it isdifferent from embeddings. )
# LLM model to generate text based on the input

"""We can load LLM locally using hugging face **transformers**"""

pip install -i https://pypi.org/simple/ bitsandbytes

pip install -U transformers

pip install bitsandbytes==0.42.0

pip install accelerate

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
# from huggingface_hub import login
# HUGGING_FACE_TOKEN = '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'

# login(HUGGING_FACE_TOKEN)

from google.colab import userdata
hf_token = userdata.get('HF')
print(hf_token)

from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)


# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
#model_id = "google/gemma-7b-it"
model_id = model_id # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id,
                                          token=hf_token)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16,
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=True, # use full memory
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to("cuda")

llm_model

print(f"[INFO] Using attention implementation: {attn_implementation}")

"""Lets test the number of **parameters**"""

def get_model_params(model: torch.nn.Module):
  return sum([param.numel() for param in model.parameters()])

get_model_params(llm_model)