# Install (or upgrade, -U) the packages this notebook imports, with quiet
# output (-q): llama-index, trulens_eval and sentence_transformers.
# NOTE(review): versions are unpinned; pin them if the APIs used below drift.
!pip install -qU llama-index trulens_eval sentence_transformers

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 682.0/682.0 kB 7.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 9.0 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 288.2/288.2 kB 10.4 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 15.7 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 817.7/817.7 kB 14.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 287.5/287.5 kB 19.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.1/8.1 MB 32.3 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 47.1 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 70.5/70.5 kB 7.9 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.4/233.4 kB 22.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.8/78.8 kB 7.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 47.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 104.2/104.2 kB 10.7 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.0/53.0 kB 5.1 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.3/207.3 kB 19.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.8/4.8 MB 56.3 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 83.0/83.0 kB 9.4 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 680.8/680.8 kB 41.4 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 635.4/635.4 kB 33.4 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 51.0 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.7/62.7 kB 5.2 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.8/144.8 kB 14.3 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 250.8/250.8 kB 21.3 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 57.1 MB/s eta 0:00:00
  Building wheel for millify (setup.py) ... done


import os
from dotenv import load_dotenv,find_dotenv

# Option 1 (disabled): load OPENAI_API_KEY from a local .env file.
# load_dotenv(find_dotenv())

# Option 2: set the key directly. "sk-..." is a placeholder; replace it with a
# real key before running, and never commit a real key with the notebook.
# Note that this assignment overwrites any OPENAI_API_KEY that is already
# present in the environment.
os.environ["OPENAI_API_KEY"] = "sk-..."

# Debug aid (disabled): print the key to confirm it is set. Avoid leaving this
# enabled, since it would write the secret into the notebook output.
# print(os.environ["OPENAI_API_KEY"])


# Create the data directory if it does not exist, then download the Paul
# Graham essay collection into it as a single text file.
!mkdir -p 'data'
!wget 'https://raw.githubusercontent.com/dbredvick/paul-graham-to-kindle/main/paul_graham_essays.txt' -O 'data/paul_graham_essays.txt'

--2024-04-11 21:00:12--  https://raw.githubusercontent.com/dbredvick/paul-graham-to-kindle/main/paul_graham_essays.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3075911 (2.9M) [text/plain]
Saving to: ‘data/paul_graham_essays.txt’

data/paul_graham_es 100%[===================>]   2.93M  --.-KB/s    in 0.07s   

2024-04-11 21:00:13 (42.8 MB/s) - ‘data/paul_graham_essays.txt’ saved [3075911/3075911]


from llama_index.core import SimpleDirectoryReader

# Read the downloaded essay file into a list of LlamaIndex documents.
essay_files = ["./data/paul_graham_essays.txt"]
documents = SimpleDirectoryReader(input_files=essay_files).load_data()


# Sanity-check what the reader returned: the container type and the number of
# documents (each followed by a blank line), then the type and a preview of
# the first document.
for summary in (type(documents), len(documents)):
    print(summary, "\n")
print(type(documents[0]))
print(documents[0])

<class 'list'> 

1 

<class 'llama_index.core.schema.Document'>
Doc ID: d207f5fb-9c17-4376-942d-fc82475478a3
Text: # RSS  [](index.html)             Aaron Swartz created a scraped
[feed](http://www.aaronsw.com/2002/feeds/pgessays.rss) of the essays
page.                    * * *    # This Year We Can End the Death
Penalty in California  [](index.html)             November 2016
If you're a California voter, there is an important proposition on
your ballo...


# The reader may return several documents. Their text is merged (separated by
# blank lines) into a single Document, which the original author notes
# improves chunking performance. A Document stores the text along with
# `metadata` and `relationships` with other Documents/Nodes.

from llama_index.core import Document

merged_text = "\n\n".join(doc.text for doc in documents)
document = Document(text=merged_text)


from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.settings import Settings

# Configure the global LlamaIndex Settings: gpt-3.5-turbo with a low
# temperature (0.1) as the LLM, and OpenAIEmbedding with its default
# arguments as the embedding model.
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
Settings.embed_model = OpenAIEmbedding()


from llama_index.core import VectorStoreIndex

# Build a vector store index over the single combined document.
index = VectorStoreIndex.from_documents([document])


# Baseline ("direct") query engine created with default arguments.
query_engine = index.as_query_engine()


# Smoke-test the baseline engine with one question and print the answer text.
schedule_question = (
    "What's the difference between a maker's schedule and a manager's schedule?"
)
response = query_engine.query(schedule_question)
print(str(response))

The difference between a maker's schedule and a manager's schedule lies in how time is structured and utilized. The manager's schedule is divided into one-hour intervals, allowing for frequent task changes and meetings. In contrast, the maker's schedule involves working in larger blocks of time, such as half a day, to focus on creative tasks like programming or writing without interruptions. Meetings are disruptive for those on the maker's schedule as they break the flow of work, while they are more manageable for those on the manager's schedule.


# Questions used to evaluate both the baseline and the sentence-window
# pipelines. Order matters only in that records are produced in this order.
eval_questions = [
    "What's a good way for someone with kids to fund a startup?",
    "What are the ten reasons why the US has such a high concentration of startups?",
    "What are some ways to avoid copying the wrong thing?",
    "What is 'good procrastination' according to Paul Graham?",
    "What are the 18 reasons startups fail according to Paul Graham?",
    "What are the six principles for making new things?",
    "What is ramen profitable?",
    "Who are Paul Graham's top founders?",
    "Which VC investor famously passed on AirBnB after a mutual introduction by Paul Graham?",
    "What are the things should founders do that don't scale?",
]


# Write the evaluation questions to a file, one per line. The encoding is
# given explicitly so the file's bytes do not depend on the platform's
# default text encoding.
with open('eval_questions.txt', 'w', encoding='utf-8') as file:
    file.writelines(f"{item}\n" for item in eval_questions)


from trulens_eval import Tru
# TruLens handle used below to fetch records, show the leaderboard and start
# the dashboard.
tru = Tru()

# Start this notebook run from a reset database.
# NOTE(review): presumably this discards all previously logged records and
# feedback results; confirm before running against a database you want to keep.
tru.reset_database()


from trulens_eval import (
    Feedback,
    TruLlama,
)
# Import the TruLens provider under an alias. A plain `OpenAI` import here
# would rebind the `OpenAI` name this notebook already imported from
# llama_index.llms.openai, so any later `OpenAI(model=...)` call would
# construct the TruLens provider instead of the LlamaIndex LLM.
from trulens_eval import OpenAI as TruLensOpenAI

from trulens_eval.feedback import Groundedness

import numpy as np

# Feedback provider whose methods are used as the scoring functions below.
openai = TruLensOpenAI()

# Answer Relevance: evaluated on the record's main input (the question) and
# main output (the answer).
qa_relevance = (
    Feedback(openai.relevance_with_cot_reasons, name="Answer Relevance")
    .on_input_output()
)

# Context Relevance: evaluated on the question and the text of each retrieved
# source node; the per-node scores are averaged.
# NOTE(review): this uses the same provider method as Answer Relevance. Check
# whether the installed trulens_eval version offers a dedicated
# context-relevance method that should be used here instead.
qs_relevance = (
    Feedback(openai.relevance_with_cot_reasons, name="Context Relevance")
    .on_input()
    .on(TruLlama.select_source_nodes().node.text)
    .aggregate(np.mean)
)

grounded = Groundedness(groundedness_provider=openai)

# Groundedness: the retrieved source-node text is the "source" and the final
# answer is the "statement"; results are combined with the Groundedness
# object's own aggregator.
groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
    .on(TruLlama.select_source_nodes().node.text)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# The three feedback functions applied to every recorded query.
feedbacks = [qa_relevance, qs_relevance, groundedness]

def get_prebuilt_trulens_recorder(query_engine, app_id):
    """Wrap ``query_engine`` in a ``TruLlama`` recorder.

    Args:
        query_engine: The query engine to record.
        app_id: Identifier passed to ``TruLlama`` as ``app_id``.

    Returns:
        A ``TruLlama`` instance configured with the module-level
        ``feedbacks`` list.
    """
    return TruLlama(query_engine, app_id=app_id, feedbacks=feedbacks)

# Recorder for the baseline query engine, registered as "Direct Query Engine".
tru_recorder = get_prebuilt_trulens_recorder(
    query_engine,
    app_id="Direct Query Engine",
)

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Run every evaluation question through the baseline engine while the
# recorder's context is active. The responses themselves are not used here;
# `response` simply ends up holding the last one.
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/2 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/2 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]


# Fetch the logged records together with the feedback information.
# NOTE(review): presumably an empty app_ids list means "do not filter by
# app"; confirm against the trulens_eval documentation.
records, feedback = tru.get_records_and_feedback(app_ids=[])


# Preview the first rows of the records table (displayed as notebook output).
records.head()


# Launches on http://localhost:8501/
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Submit this IP Address: 34.73.99.55

<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>


# from llama_index.llms.openai import OpenAI

# llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)


from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core import load_index_from_storage
import os


# Sentinel that distinguishes "embed_model was not passed" from any explicit
# value, so the default can be resolved when the function is called rather
# than once when it is defined.
_EMBED_MODEL_UNSET = object()


def build_sentence_window_index(
    document, llm, embed_model=_EMBED_MODEL_UNSET, save_dir="sentence_index"
):
    """Build a sentence-window vector index for ``document``, or reload it.

    Side effects: sets ``Settings.llm`` and ``Settings.node_parser`` globally,
    and ``Settings.embed_model`` when ``embed_model`` is passed.

    Args:
        document: The document to index.
        llm: LLM assigned to ``Settings.llm``.
        embed_model: Embedding model assigned to ``Settings.embed_model``.
            When omitted, ``Settings.embed_model`` is left as it is at call
            time. (The previous default captured ``Settings.embed_model``
            once, at definition time, and would silently restore that stale
            value on later calls.)
        save_dir: Directory the index is persisted to and reloaded from.

    Returns:
        The newly built index if ``save_dir`` did not exist, otherwise the
        index loaded from ``save_dir``.

    Note:
        If ``save_dir`` already exists the persisted index is loaded as-is;
        ``document`` is not re-indexed. Delete the directory to force a
        rebuild.
    """
    # Sentence window node parser: a window of 3 is stored under the "window"
    # metadata key and the original text under "original_text".
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=3,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    Settings.llm = llm
    if embed_model is not _EMBED_MODEL_UNSET:
        Settings.embed_model = embed_model
    Settings.node_parser = node_parser

    # Build and persist on first use; afterwards reload from save_dir.
    if not os.path.exists(save_dir):
        sentence_index = VectorStoreIndex.from_documents([document])
        sentence_index.storage_context.persist(persist_dir=save_dir)
    else:
        sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir))

    return sentence_index


# Build the sentence-window index (or reload it from ./sentence_index if that
# directory already exists), passing the LLM and embedding model currently
# set on Settings.
sentence_index = build_sentence_window_index(
    document,
    llm=Settings.llm,
    embed_model=Settings.embed_model,
    # Disabled alternative: a "local:" embedding model spec.
    #embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index"
)


from sentence_transformers import SentenceTransformer

def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
    rerank_model="BAAI/bge-reranker-base",
):
    """Create a query engine over ``sentence_index`` with two postprocessors.

    Args:
        sentence_index: Index to query; ``as_query_engine`` is called on it.
        similarity_top_k: Passed to ``as_query_engine`` as
            ``similarity_top_k``.
        rerank_top_n: Passed to ``SentenceTransformerRerank`` as ``top_n``.
        rerank_model: Model name passed to ``SentenceTransformerRerank``.
            Defaults to the model that was previously hard-coded, so existing
            callers are unaffected.

    Returns:
        The query engine returned by ``sentence_index.as_query_engine``.
    """
    # Postprocessors are applied in list order: metadata replacement using the
    # "window" metadata key first, then the reranker.
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model)

    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[postproc, rerank],
    )

# Sentence-window query engine with the helper's default top-k/rerank values.
sentence_window_engine = get_sentence_window_query_engine(sentence_index)


# Ask the same smoke-test question that was put to the baseline engine.
window_question = (
    "What's the difference between a maker's schedule and a manager's schedule?"
)
window_response = sentence_window_engine.query(window_question)
print(str(window_response))

The difference between a maker's schedule and a manager's schedule lies in how time is structured and utilized. The manager's schedule is divided into one-hour intervals, allowing for frequent task changes and meetings. In contrast, the maker's schedule prefers larger blocks of time, like half a day, to focus on deep work without interruptions. Meetings are disruptive for those on the maker's schedule as they break the flow of work, while they are more manageable for those on the manager's schedule who are accustomed to frequent task switching.


# Left disabled: resetting here would presumably drop the baseline records
# that the comparison further down relies on.
# tru.reset_database()

# Recorder for the sentence-window engine, registered under its own app id.
tru_recorder_sentence_window = get_prebuilt_trulens_recorder(
    sentence_window_engine, app_id="Sentence Window Query Engine"
)


# Run every evaluation question through the sentence-window engine, entering
# the recorder's context once per question, and print each question followed
# by its answer.
for question in eval_questions:
    with tru_recorder_sentence_window as recording:
        response = sentence_window_engine.query(question)
        print(question)
        print(str(response))

What's a good way for someone with kids to fund a startup?
A good way for someone with kids to fund a startup is to keep expenses low and aim to become profitable as soon as possible. This approach, known as being "ramen profitable," involves minimizing costs and focusing on generating enough revenue to cover basic living expenses. By adopting this strategy, individuals can reduce their reliance on external funding and demonstrate financial discipline to potential investors.

Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

What are the ten reasons why the US has such a high concentration of startups?
1. The US allows immigration, attracting smart and ambitious individuals.
2. The US is a rich country, providing resources and infrastructure for startups.
3. The US is not (yet) a police state, allowing for freedom of thought and innovation.
4. The US has a dynamic approach to career paths, fostering a culture of risk-taking and entrepreneurship.
5. American universities, particularly those like Stanford, play a significant role in nurturing startups.
6. The US has a history of embracing ambition and risk-taking, key traits for startup success.
7. American attitudes towards ambition and risk are more accepting compared to some other regions.
8. The US has a culture that encourages individuals to pursue unconventional career paths.
9. The US has a large pool of venture capital funding, supporting the growth of startups.
10. The US has a network of successful startup founders who serve as examples and mentors for aspiring entrepreneurs.

Groundedness per statement in source:   0%|          | 0/20 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/20 [00:00<?, ?it/s]

What are some ways to avoid copying the wrong thing?
Avoid copying the wrong thing by only imitating what you genuinely like. This approach can help in distinguishing between things you admire and things you truly enjoy. Another method is to pay attention to guilty pleasures and identify what you genuinely like when not influenced by external factors. Additionally, when copying good things, be cautious to replicate their strengths rather than their flaws, as flaws can be easier to imitate but may not contribute to the overall quality.

Groundedness per statement in source:   0%|          | 0/4 [00:00<?, ?it/s]

Groundedness per statement in source:   0%|          | 0/4 [00:00<?, ?it/s]

What is 'good procrastination' according to Paul Graham?
Good procrastination, according to Paul Graham, involves avoiding errands in order to focus on real work.


# Show the leaderboard of aggregate results per app.
# NOTE(review): presumably an empty app_ids list means "include every app";
# confirm against the trulens_eval documentation.
tru.get_leaderboard(app_ids=[])


# Disabled: start the dashboard again (launches on http://localhost:8501/).
# tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Submit this IP Address: 34.73.99.55

<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

	app_id	app_json	type	record_id	input	output	tags	record_json	cost_json	perf_json	ts	Answer Relevance	Context Relevance	Groundedness	Answer Relevance_calls	Context Relevance_calls	Groundedness_calls	latency	total_tokens	total_cost
0	Direct Query Engine	{"tru_class_info": {"name": "TruLlama", "modul...	RetrieverQueryEngine(llama_index.core.query_en...	record_hash_32e79f96e52ff5d0802975da2bb02a5c	"What's a good way for someone with kids to fu...	"A good way for someone with kids to fund a st...	-	{"record_id": "record_hash_32e79f96e52ff5d0802...	{"n_requests": 2, "n_successful_requests": 2, ...	{"start_time": "2024-04-11T21:18:46.413592", "...	2024-04-11T21:18:51.099753	0.8	0.6	0.333333	[{'args': {'prompt': 'What's a good way for so...	[{'args': {'prompt': 'What's a good way for so...	[{'args': {'source': 'Friends and Family ...	4	2177	0.003288
1	Direct Query Engine	{"tru_class_info": {"name": "TruLlama", "modul...	RetrieverQueryEngine(llama_index.core.query_en...	record_hash_37d1f6df6d07d4963375c48744643586	"What are the ten reasons why the US has such ...	"The US has a high concentration of startups d...	-	{"record_id": "record_hash_37d1f6df6d07d496337...	{"n_requests": 2, "n_successful_requests": 2, ...	{"start_time": "2024-04-11T21:18:51.833109", "...	2024-04-11T21:18:55.760902	0.9	0.9	0.000000	[{'args': {'prompt': 'What are the ten reasons...	[{'args': {'prompt': 'What are the ten reasons...	[{'args': {'source': 'But the worst problem in...	3	2166	0.003260
2	Direct Query Engine	{"tru_class_info": {"name": "TruLlama", "modul...	RetrieverQueryEngine(llama_index.core.query_en...	record_hash_14ce1dbcd4a3a4644a3c03c34f33371f	"What are some ways to avoid copying the wrong...	"Avoid copying the wrong thing by copying only...	-	{"record_id": "record_hash_14ce1dbcd4a3a4644a3...	{"n_requests": 2, "n_successful_requests": 2, ...	{"start_time": "2024-04-11T21:18:57.140698", "...	2024-04-11T21:19:02.415636	0.8	1.0	1.000000	[{'args': {'prompt': 'What are some ways to av...	[{'args': {'prompt': 'What are some ways to av...	[{'args': {'source': '# Copy What You Like []...	5	2140	0.003226
3	Direct Query Engine	{"tru_class_info": {"name": "TruLlama", "modul...	RetrieverQueryEngine(llama_index.core.query_en...	record_hash_eacc0748cb27656252c9d6d783ebefc9	"What is 'good procrastination' according to P...	"Good procrastination, according to Paul Graha...	-	{"record_id": "record_hash_eacc0748cb27656252c...	{"n_requests": 2, "n_successful_requests": 2, ...	{"start_time": "2024-04-11T21:19:03.141396", "...	2024-04-11T21:19:07.158204	1.0	1.0	0.550000	[{'args': {'prompt': 'What is 'good procrastin...	[{'args': {'prompt': 'What is 'good procrastin...	[{'args': {'source': 'To the extent this means...	4	2153	0.003237
4	Direct Query Engine	{"tru_class_info": {"name": "TruLlama", "modul...	RetrieverQueryEngine(llama_index.core.query_en...	record_hash_dd634911d2fc283b5e8ff65293b74fc4	"What are the 18 reasons startups fail accordi...	"There are several reasons startups fail accor...	-	{"record_id": "record_hash_dd634911d2fc283b5e8...	{"n_requests": 2, "n_successful_requests": 2, ...	{"start_time": "2024-04-11T21:19:07.901592", "...	2024-04-11T21:19:11.666227	0.2	0.5	0.600000	[{'args': {'prompt': 'What are the 18 reasons ...	[{'args': {'prompt': 'What are the 18 reasons ...	[{'args': {'source': '\[[13](#f13n)\] If ...	3	2039	0.003064

	Answer Relevance	Context Relevance	Groundedness	latency	total_cost
app_id
Sentence Window Query Engine	0.81	0.63	0.756333	3.8	0.003088
Direct Query Engine	0.66	0.67	0.548333	3.8	0.003179

Sentence Window Retrieval for Enhanced Context

Attribution

Why should you read this notebook?

Motivation & Main Idea

Summary of Results

Set up

Install dependencies

Load data

Basic RAG pipeline

Configure embedding model and LLM

Create index and query engine

Run test query

Evaluation

Evaluation questions

RAG Triad

Set up TruLens evals

Advanced RAG pipeline

Sentence Window retrieval

Evaluation