%pip install -qU PyGithub langchain requests openai python-dotenv

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 354.4/354.4 kB 7.2 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 311.6/311.6 kB 30.7 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 856.7/856.7 kB 33.3 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 291.3/291.3 kB 28.9 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 64.7 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.6/75.6 kB 7.1 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.9/77.9 kB 9.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.3/58.3 kB 4.0 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 115.5/115.5 kB 12.0 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.0/53.0 kB 5.7 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 141.1/141.1 kB 15.1 MB/s eta 0:00:00


import warnings
warnings.filterwarnings('ignore')


import os
from dotenv import load_dotenv,find_dotenv

# # Load tokens from local .env file
# load_dotenv(find_dotenv())

# Or set them like this
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["CIRCLE_TOKEN"] = "CCIPRJ_..."
os.environ["GH_TOKEN"] = "github_pat_..."

## Print key to check
# print(os.getenv["OPENAI_API_KEY"])
# print(os.getenv["CIRCLE_TOKEN"])
# print(os.getenv["GH_TOKEN"])


human_template  = "{question}"


quiz_bank = """1. Subject: Leonardo DaVinci
   Categories: Art, Science
   Facts:
    - Painted the Mona Lisa
    - Studied zoology, anatomy, geology, optics
    - Designed a flying machine

2. Subject: Paris
   Categories: Art, Geography
   Facts:
    - Location of the Louvre, the museum where the Mona Lisa is displayed
    - Capital of France
    - Most populous city in France
    - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie

3. Subject: Telescopes
   Category: Science
   Facts:
    - Device to observe different objects
    - The first refracting telescopes were invented in the Netherlands in the 17th Century
    - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror

4. Subject: Starry Night
   Category: Art
   Facts:
    - Painted by Vincent van Gogh in 1889
    - Captures the east-facing view of van Gogh's room in Saint-Rémy-de-Provence

5. Subject: Physics
   Category: Science
   Facts:
    - The sun doesn't change color during sunset.
    - Water slows the speed of light
    - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal."""


delimiter = "####"

prompt_template = f"""
Follow these steps to generate a customized quiz for the user.
The question will be delimited with four hashtags i.e {delimiter}

The user will provide a category that they want to create a quiz for. Any questions included in the quiz
should only refer to the category.

Step 1:{delimiter} First identify the category user is asking about from the following list:
* Geography
* Science
* Art

Step 2:{delimiter} Determine the subjects to generate questions about. The list of topics are below:

{quiz_bank}

Pick up to two subjects that fit the user's category.

Step 3:{delimiter} Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject.

Use the following format for the quiz:
Question 1:{delimiter} <question 1>

Question 2:{delimiter} <question 2>

Question 3:{delimiter} <question 3>

"""


from langchain.prompts import ChatPromptTemplate
chat_prompt = ChatPromptTemplate.from_messages([("human", prompt_template)])


# Check the prompt
chat_prompt

ChatPromptTemplate(input_variables=[], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template="\nFollow these steps to generate a customized quiz for the user.\nThe question will be delimited with four hashtags i.e ####\n\nThe user will provide a category that they want to create a quiz for. Any questions included in the quiz\nshould only refer to the category.\n\nStep 1:#### First identify the category user is asking about from the following list:\n* Geography\n* Science\n* Art\n\nStep 2:#### Determine the subjects to generate questions about. The list of topics are below:\n\n1. Subject: Leonardo DaVinci\n   Categories: Art, Science\n   Facts:\n    - Painted the Mona Lisa\n    - Studied zoology, anatomy, geology, optics\n    - Designed a flying machine\n\n2. Subject: Paris\n   Categories: Art, Geography\n   Facts:\n    - Location of the Louvre, the museum where the Mona Lisa is displayed\n    - Capital of France\n    - Most populous city in France\n    - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie\n\n3. Subject: Telescopes\n   Category: Science\n   Facts:\n    - Device to observe different objects\n    - The first refracting telescopes were invented in the Netherlands in the 17th Century\n    - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror\n\n4. Subject: Starry Night\n   Category: Art\n   Facts:\n    - Painted by Vincent van Gogh in 1889\n    - Captures the east-facing view of van Gogh's room in Saint-Rémy-de-Provence\n\n5. Subject: Physics\n   Category: Science\n   Facts:\n    - The sun doesn't change color during sunset.\n    - Water slows the speed of light\n    - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal.\n\nPick up to two subjects that fit the user's category.\n\nStep 3:#### Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject.\n\nUse the following format for the quiz:\nQuestion 1:#### <question 1>\n\nQuestion 2:#### <question 2>\n\nQuestion 3:#### <question 3>\n\n"))])


from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
llm


# Parse LLM output and convert to string
from langchain.schema.output_parser import StrOutputParser
output_parser = StrOutputParser()
output_parser

StrOutputParser()


chain = chat_prompt | llm | output_parser
chain

ChatPromptTemplate(input_variables=[], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template="\nFollow these steps to generate a customized quiz for the user.\nThe question will be delimited with four hashtags i.e ####\n\nThe user will provide a category that they want to create a quiz for. Any questions included in the quiz\nshould only refer to the category.\n\nStep 1:#### First identify the category user is asking about from the following list:\n* Geography\n* Science\n* Art\n\nStep 2:#### Determine the subjects to generate questions about. The list of topics are below:\n\n1. Subject: Leonardo DaVinci\n   Categories: Art, Science\n   Facts:\n    - Painted the Mona Lisa\n    - Studied zoology, anatomy, geology, optics\n    - Designed a flying machine\n\n2. Subject: Paris\n   Categories: Art, Geography\n   Facts:\n    - Location of the Louvre, the museum where the Mona Lisa is displayed\n    - Capital of France\n    - Most populous city in France\n    - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie\n\n3. Subject: Telescopes\n   Category: Science\n   Facts:\n    - Device to observe different objects\n    - The first refracting telescopes were invented in the Netherlands in the 17th Century\n    - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror\n\n4. Subject: Starry Night\n   Category: Art\n   Facts:\n    - Painted by Vincent van Gogh in 1889\n    - Captures the east-facing view of van Gogh's room in Saint-Rémy-de-Provence\n\n5. Subject: Physics\n   Category: Science\n   Facts:\n    - The sun doesn't change color during sunset.\n    - Water slows the speed of light\n    - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal.\n\nPick up to two subjects that fit the user's category.\n\nStep 3:#### Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject.\n\nUse the following format for the quiz:\nQuestion 1:#### <question 1>\n\nQuestion 2:#### <question 2>\n\nQuestion 3:#### <question 3>\n\n"))])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x788ac4000310>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x788ac40020e0>, temperature=0.0, openai_api_key='sk-...', openai_proxy='')
| StrOutputParser()


# Combine previous steps into one simple function
def assistant_chain(
    system_message,
    human_template="{question}",
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    output_parser=StrOutputParser()):

  chat_prompt = ChatPromptTemplate.from_messages([
      ("system", system_message),
      ("human", human_template),
  ])
  return chat_prompt | llm | output_parser


def eval_expected_words(
    system_message,
    question,
    expected_words,
    human_template="{question}",
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    output_parser=StrOutputParser()):

  assistant = assistant_chain(
      system_message,
      human_template,
      llm,
      output_parser)


  answer = assistant.invoke({"question": question})

  print(answer)

  assert any(word in answer.lower() \
             for word in expected_words), \
    f"Expected the assistant questions to include \
    '{expected_words}', but it did not"


question  = "Generate a quiz about science."
expected_words = ["davinci", "telescope", "physics", "curie"]


eval_expected_words(
    prompt_template,
    question,
    expected_words
)

Step 1:#### First identify the category user is asking about from the following list:
* Geography
* Science
* Art

Step 2:#### Determine the subjects to generate questions about. The list of topics are below:

3. Subject: Telescopes
   Category: Science
   Facts:
    - Device to observe different objects
    - The first refracting telescopes were invented in the Netherlands in the 17th Century
    - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror

5. Subject: Physics
   Category: Science
   Facts:
    - The sun doesn't change color during sunset.
    - Water slows the speed of light
    - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal.

Step 3:#### Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject.

Question 1:#### What is the purpose of a telescope?
Question 2:#### In which century were the first refracting telescopes invented and where?
Question 3:#### Why is the Eiffel Tower in Paris taller in the summer than in the winter?


def evaluate_refusal(
    system_message,
    question,
    decline_response,
    human_template="{question}",
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    output_parser=StrOutputParser()):

  assistant = assistant_chain(human_template,
                              system_message,
                              llm,
                              output_parser)

  answer = assistant.invoke({"question": question})
  print(answer)

  assert decline_response.lower() in answer.lower(), \
    f"Expected the bot to decline with \
    '{decline_response}' got {answer}"


question  = "Generate a quiz about Rome."
decline_response = "I'm sorry"


evaluate_refusal(
    prompt_template,
    question,
    decline_response
)

#### First identify the category user is asking about from the following list:
* Geography
* Science
* Art

#### Geography

#### Science

Step 2:#### Determine the subjects to generate questions about. The list of topics are below:

2. Subject: Paris
   Categories: Art, Geography
   Facts:
    - Location of the Louvre, the museum where the Mona Lisa is displayed
    - Capital of France
    - Most populous city in France
    - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie

3. Subject: Telescopes
   Category: Science
   Facts:
    - Device to observe different objects
    - The first refracting telescopes were invented in the Netherlands in the 17th Century
    - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror

Step 3:#### Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject.

Question 1:#### In which city is the Louvre located, the museum where the Mona Lisa is displayed?
a) London
b) Paris
c) Rome
d) Madrid

Question 2:#### What is the capital of France?
a) Berlin
b) Paris
c) London
d) Rome

Question 3:#### Who discovered Radium and Polonium in Paris?
a) Isaac Newton
b) Marie and Pierre Curie
c) Galileo Galilei
d) Albert Einstein

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-21-e954c1262350> in <cell line: 1>()
----> 1 evaluate_refusal(
      2     prompt_template,
      3     question,
      4     decline_response
      5 )

<ipython-input-19-cc8b6a571fa4> in evaluate_refusal(system_message, question, decline_response, human_template, llm, output_parser)
     15   print(answer)
     16 
---> 17   assert decline_response.lower() in answer.lower(), \
     18     f"Expected the bot to decline with \
     19     '{decline_response}' got {answer}"

AssertionError: Expected the bot to decline with     'I'm sorry' got #### First identify the category user is asking about from the following list:
* Geography
* Science
* Art

#### Geography

#### Science

Step 2:#### Determine the subjects to generate questions about. The list of topics are below:

2. Subject: Paris
   Categories: Art, Geography
   Facts:
    - Location of the Louvre, the museum where the Mona Lisa is displayed
    - Capital of France
    - Most populous city in France
    - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie

3. Subject: Telescopes
   Category: Science
   Facts:
    - Device to observe different objects
    - The first refracting telescopes were invented in the Netherlands in the 17th Century
    - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror

Step 3:#### Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject.

Question 1:#### In which city is the Louvre located, the museum where the Mona Lisa is displayed?
a) London
b) Paris
c) Rome
d) Madrid

Question 2:#### What is the capital of France?
a) Berlin
b) Paris
c) London
d) Rome

Question 3:#### Who discovered Radium and Polonium in Paris?
a) Isaac Newton
b) Marie and Pierre Curie
c) Galileo Galilei
d) Albert Einstein


%%writefile app.py
from langchain.prompts                import ChatPromptTemplate
from langchain.chat_models            import ChatOpenAI
from langchain.schema.output_parser   import StrOutputParser

delimiter = "####"

quiz_bank = """1. Subject: Leonardo DaVinci
   Categories: Art, Science
   Facts:
    - Painted the Mona Lisa
    - Studied zoology, anatomy, geology, optics
    - Designed a flying machine

2. Subject: Paris
   Categories: Art, Geography
   Facts:
    - Location of the Louvre, the museum where the Mona Lisa is displayed
    - Capital of France
    - Most populous city in France
    - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie

3. Subject: Telescopes
   Category: Science
   Facts:
    - Device to observe different objects
    - The first refracting telescopes were invented in the Netherlands in the 17th Century
    - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror

4. Subject: Starry Night
   Category: Art
   Facts:
    - Painted by Vincent van Gogh in 1889
    - Captures the east-facing view of van Gogh's room in Saint-Rémy-de-Provence

5. Subject: Physics
   Category: Science
   Facts:
    - The sun doesn't change color during sunset.
    - Water slows the speed of light
    - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal.
"""

system_message = f"""
Follow these steps to generate a customized quiz for the user.
The question will be delimited with four hashtags i.e {delimiter}

The user will provide a category that they want to create a quiz for. Any questions included in the quiz
should only refer to the category.

Step 1:{delimiter} First identify the category user is asking about from the following list:
* Geography
* Science
* Art

Step 2:{delimiter} Determine the subjects to generate questions about. The list of topics are below:

{quiz_bank}

Pick up to two subjects that fit the user's category.

Step 3:{delimiter} Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject.

Use the following format for the quiz:
Question 1:{delimiter} <question 1>

Question 2:{delimiter} <question 2>

Question 3:{delimiter} <question 3>

Additional rules:

- Only include questions from information in the quiz bank. Students only know answers to questions from the quiz bank, do not ask them about other topics.
- Only use explicit matches for the category, if the category is not an exact match to categories in the quiz bank, answer that you do not have information.
- If the user asks a question about a subject you do not have information about in the quiz bank, answer "I'm sorry I do not have information about that".
"""

"""
  Helper functions for writing the test cases
"""

def assistant_chain(
    system_message=system_message,
    human_template="{question}",
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    output_parser=StrOutputParser()):

  chat_prompt = ChatPromptTemplate.from_messages([
      ("system", system_message),
      ("human", human_template),
  ])
  return chat_prompt | llm | output_parser

Writing app.py


%%writefile test_assistant.py
from app import assistant_chain
from app import system_message
from langchain.prompts                import ChatPromptTemplate
from langchain.chat_models            import ChatOpenAI
from langchain.schema.output_parser   import StrOutputParser

import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

def eval_expected_words(
    system_message,
    question,
    expected_words,
    human_template="{question}",
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    output_parser=StrOutputParser()):

  assistant = assistant_chain(system_message)
  answer = assistant.invoke({"question": question})
  print(answer)

  assert any(word in answer.lower() \
             for word in expected_words), \
    f"Expected the assistant questions to include \
    '{expected_words}', but it did not"

def evaluate_refusal(
    system_message,
    question,
    decline_response,
    human_template="{question}",
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    output_parser=StrOutputParser()):

  assistant = assistant_chain(human_template,
                              system_message,
                              llm,
                              output_parser)

  answer = assistant.invoke({"question": question})
  print(answer)

  assert decline_response.lower() in answer.lower(), \
    f"Expected the bot to decline with \
    '{decline_response}' got {answer}"

"""
  Test cases
"""

def test_science_quiz():

  question  = "Generate a quiz about science."
  expected_subjects = ["davinci", "telescope", "physics", "curie"]
  eval_expected_words(
      system_message,
      question,
      expected_subjects)

def test_geography_quiz():
  question  = "Generate a quiz about geography."
  expected_subjects = ["paris", "france", "louvre"]
  eval_expected_words(
      system_message,
      question,
      expected_subjects)

def test_refusal_rome():
  question  = "Help me create a quiz about Rome"
  decline_response = "I'm sorry"
  evaluate_refusal(
      system_message,
      question,
      decline_response)

Writing test_assistant.py


%%writefile circleci_config.yml
version: 2.1
orbs:
  # The python orb contains a set of prepackaged circleci configuration you can use repeatedly in your configurations files
  # Orb commands and jobs help you with common scripting around a language/tool
  # so you dont have to copy and paste it everywhere.
  # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python
  python: circleci/python@2.1.1

parameters:
  eval-mode:
    type: string
    default: "commit"

workflows:
  evaluate-commit:
    when:
      equal: [ commit, << pipeline.parameters.eval-mode >> ]
    jobs:
      - run-commit-evals:
          context:
            - dl-ai-courses
  evaluate-release:
    when:
      equal: [ release, << pipeline.parameters.eval-mode >> ]
    jobs:
      - run-pre-release-evals:
          context:
            - dl-ai-courses
  evaluate-all:
    when:
      equal: [ full, << pipeline.parameters.eval-mode >> ]
    jobs:
      - run-manual-evals:
          context:
            - dl-ai-courses
  report-evals:
    when:
      equal: [ report, << pipeline.parameters.eval-mode >> ]
    jobs:
      - store-eval-artifacts:
          context:
            - dl-ai-courses

jobs:
  run-commit-evals:  # This is the name of the job, feel free to change it to better match what you're trying to do!
    # These next lines defines a docker executors: https://circleci.com/docs/2.0/executor-types/
    # You can specify an image from dockerhub or use one of the convenience images from CircleCI's Developer Hub
    # A list of available CircleCI docker convenience images are available here: https://circleci.com/developer/images/image/cimg/python
    # The executor is the environment in which the steps below will be executed - below will use a python 3.9 container
    # Change the version below to your required version of python
    docker:
      - image: cimg/python:3.10.5
    # Checkout the code as the first step. This is a dedicated CircleCI step.
    # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default.
    # Here we're making sure we use just use the system-wide pip. By default it uses the project root's requirements.txt.
    # Then run your tests!
    # CircleCI will report the results back to your VCS provider.
    steps:
      - checkout
      - python/install-packages:
          pkg-manager: pip
          # app-dir: ~/project/package-directory/  # If your requirements.txt isn't in the root directory.
          # pip-dependency-file: test-requirements.txt  # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
      - run:
          name: Run assistant evals.
          command: python -m pytest --junitxml results.xml test_assistant.py
      - store_test_results:
          path: results.xml
  run-pre-release-evals:
    docker:
      - image: cimg/python:3.10.5
    steps:
      - checkout
      - python/install-packages:
          pkg-manager: pip
          # app-dir: ~/project/package-directory/  # If your requirements.txt isn't in the root directory.
          # pip-dependency-file: test-requirements.txt  # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
      - run:
          name: Run release evals.
          command: python -m pytest --junitxml results.xml test_release_evals.py
      - store_test_results:
          path: results.xml
  run-manual-evals:
    docker:
      - image: cimg/python:3.10.5
    steps:
      - checkout
      - python/install-packages:
          pkg-manager: pip
          # app-dir: ~/project/package-directory/  # If your requirements.txt isn't in the root directory.
          # pip-dependency-file: test-requirements.txt  # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
      - run:
          name: Run end to end evals.
          command: python -m pytest --junitxml results.xml test_assistant.py test_release_evals.py
      - store_test_results:
          path: results.xml
  store-eval-artifacts:
    docker:
      - image: cimg/python:3.10.5
    steps:
      - checkout
      - python/install-packages:
          pkg-manager: pip
          # app-dir: ~/project/package-directory/  # If your requirements.txt isn't in the root directory.
          # pip-dependency-file: test-requirements.txt  # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
      - run:
          name: Save eval to html file
          command: python save_eval_artifacts.py
      - store_artifacts:
          path: /tmp/eval_results.html
          destination: eval_results.html

Writing circleci_config.yml


%%writefile requirements.txt
aiohttp==3.8.6
aiosignal==1.3.1
annotated-types==0.6.0
anyio==3.7.1
appnope==0.1.3
asttokens==2.4.0
async-timeout==4.0.3
attrs==23.1.0
backcall==0.2.0
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.0
comm==0.1.4
cryptography==41.0.5
dataclasses-json==0.6.1
debugpy==1.8.0
decorator==5.1.1
Deprecated==1.2.14
executing==2.0.0
frozenlist==1.4.0
idna==3.4
iniconfig==2.0.0
ipykernel==6.25.2
ipytest==0.13.3
ipython==8.16.1
jedi==0.19.1
jsonpatch==1.33
jsonpointer==2.4
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.9.0
jupyter-lsp==2.2.0
jupyter_client==8.4.0
jupyter_core==5.4.0
jupyter_server==2.10.1
jupyter_server_terminals==0.4.4
jupyterlab==4.0.8
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.9
jupyterlab_server==2.25.1
langchain==0.0.326
langsmith==0.0.54
marshmallow==3.20.1
matplotlib-inline==0.1.6
multidict==6.0.4
mypy-extensions==1.0.0
nest-asyncio==1.5.8
numpy==1.26.1
openai==0.28.1
packaging==23.2
pandas==2.1.4
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
platformdirs==3.11.0
pluggy==1.3.0
prompt-toolkit==3.0.39
psutil==5.9.6
ptyprocess==0.7.0
pure-eval==0.2.2
pycparser==2.21
pydantic==2.4.2
pydantic_core==2.10.1
PyGithub==2.1.1
Pygments==2.16.1
PyJWT==2.8.0
PyNaCl==1.5.0
pytest==7.4.3
python-dateutil==2.8.2
python-dotenv==1.0.0
PyYAML==6.0.1
pyzmq==25.1.1
requests==2.31.0
six==1.16.0
sniffio==1.3.0
SQLAlchemy==2.0.22
stack-data==0.6.3
tenacity==8.2.3
tornado==6.3.3
tqdm==4.66.1
traitlets==5.11.2
typing-inspect==0.9.0
typing_extensions==4.8.0
urllib3==2.0.7
wcwidth==0.2.8
wrapt==1.15.0
yarl==1.9.2

Writing requirements.txt


from github import Github, Auth, InputGitTreeElement
import time
import random
import asyncio

def _create_tree_element(repo, path, content):
    blob = repo.create_git_blob(content, "utf-8")
    element = InputGitTreeElement(
        path=path, mode="100644", type="blob", sha=blob.sha
    )
    return element

def push_files_to_github(repo_name, branch_name, files):
    files_to_push = set(files)

    # Authenticate to GitHub
    auth = Auth.Token(os.getenv("GH_TOKEN"))
    g = Github(auth=auth)
    repo = g.get_repo(repo_name)

    # Take the files we defined and create tree elements for them for building
    # a git tree
    elements = []
    config_element = _create_tree_element(
        repo, ".circleci/config.yml", open("circleci_config.yml").read()
    )
    elements.append(config_element)

    requirements_element = _create_tree_element(
        repo, "requirements.txt", open("requirements.txt").read()
    )
    elements.append(requirements_element)
    for file in files_to_push:
        print(f"uploading {file}")
        with open(file, encoding="utf-8") as f:
            content = f.read()
            element = _create_tree_element(repo, file, content)
            elements.append(element)

    head_sha = repo.get_branch("main").commit.sha

    print(f"pushing files to: {branch_name}")
    try:
        repo.create_git_ref(ref=f"refs/heads/{branch_name}", sha=head_sha)
        time.sleep(2)
    except Exception as _:
        print(f"{branch_name} already exists in the repository pushing updated changes")
    branch_sha = repo.get_branch(branch_name).commit.sha

    base_tree = repo.get_git_tree(sha=branch_sha)
    tree = repo.create_git_tree(elements, base_tree)
    parent = repo.get_git_commit(sha=branch_sha)
    commit = repo.create_git_commit("Trigger CI evaluation pipeline", tree, [parent])
    branch_refs = repo.get_git_ref(f"heads/{branch_name}")
    branch_refs.edit(sha=commit.sha)


## Verify that your token is configured properly
# auth = Auth.Token(os.getenv("GH_TOKEN"))
# g = Github(auth=auth)
# for repo in g.get_user().get_repos():
#     print(repo.name)


push_files_to_github(
    repo_name="gadkins/llmops-evals-example",
    branch_name="main",
    files=["app.py", "test_assistant.py"]
)

uploading app.py
uploading test_assistant.py
pushing files to: main
main already exists in the repository pushing updated changes


import requests

# Use this function if your CircleCI project was configured via GitLab or GitHub
# App. It uses `circleci` instead of `gh`, org ID in place of org name, and
# Project ID in place of repo name
def _trigger_circle_pipline(org_id, project_id, branch, token, params=None):
    params = {} if params is None else params
    r = requests.post(
        f"{os.getenv('DLAI_CIRCLE_CI_API_BASE', 'https://circleci.com')}/api/v2/project/circleci/{org_id}/{project_id}/pipeline",
        headers={"Circle-Token": f"{token}", "accept": "application/json"},
        json={"branch": branch, "parameters": params},
    )
    pipeline_data = r.json()
    pipeline_number = pipeline_data["number"]
    print(
        f"Please visit https://app.circleci.com/pipelines/github/{repo_name}/{pipeline_number}"
    )

# def _trigger_circle_pipline(repo_name, branch, token, params=None):
#     params = {} if params is None else params
#     r = requests.post(
#         f"{os.getenv('DLAI_CIRCLE_CI_API_BASE', 'https://circleci.com')}/api/v2/project/gh/{repo_name}/pipeline",
#         headers={"Circle-Token": f"{token}", "accept": "application/json"},
#         json={"branch": branch, "parameters": params},
#     )
#     pipeline_data = r.json()
#     pipeline_number = pipeline_data["number"]
#     print(
#         f"Please visit https://app.circleci.com/pipelines/github/{repo_name}/{pipeline_number}"
#     )

def trigger_commit_evals(org_id, project_id, branch, token):
    _trigger_circle_pipline(org_id, project_id, branch, token, {"eval-mode": "commit"})


# trigger_commit_evals(
#     org_id="8b9c2f30-d3d2-498a-b9f2-3473dd9c21c3",
#     project_id="e58e4e69-2e88-43d2-95d2-029108cf6957",
#     branch="main",
#     token=os.getenv("CIRCLE_TOKEN")
# )


delimiter = "####"


eval_system_prompt = f"""You are an assistant that evaluates \
  whether or not an assistant is producing valid quizzes.
  The assistant should be producing output in the \
  format of Question N:{delimiter} <question N>?"""


llm_response = """
Question 1:#### What is the largest telescope in space called and what material is its mirror made of?

Question 2:#### True or False: Water slows down the speed of light.

Question 3:#### What did Marie and Pierre Curie discover in Paris?
"""


eval_user_message = f"""You are evaluating a generated quiz \
based on the context that the assistant uses to create the quiz.
  Here is the data:
    [BEGIN DATA]
    ************
    [Response]: {llm_response}
    ************
    [END DATA]

Read the response carefully and determine if it looks like \
a quiz or test. Do not evaluate if the information is correct
only evaluate if the data is in the expected format.

Output Y if the response is a quiz, \
output N if the response does not look like a quiz.
"""


from langchain.prompts import ChatPromptTemplate
eval_prompt = ChatPromptTemplate.from_messages([
      ("system", eval_system_prompt),
      ("human", eval_user_message),
  ])


from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model="gpt-3.5-turbo",
                 temperature=0)


from langchain.schema.output_parser import StrOutputParser
output_parser = StrOutputParser()


eval_chain = eval_prompt | llm | output_parser


eval_chain.invoke({})

'Y'


def create_eval_chain(
    agent_response,
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    output_parser=StrOutputParser()
):
  delimiter = "####"
  eval_system_prompt = f"""You are an assistant that evaluates whether or not an assistant is producing valid quizzes.
  The assistant should be producing output in the format of Question N:{delimiter} <question N>?"""

  eval_user_message = f"""You are evaluating a generated quiz based on the context that the assistant uses to create the quiz.
  Here is the data:
    [BEGIN DATA]
    ************
    [Response]: {agent_response}
    ************
    [END DATA]

Read the response carefully and determine if it looks like a quiz or test. Do not evaluate if the information is correct
only evaluate if the data is in the expected format.

Output Y if the response is a quiz, output N if the response does not look like a quiz.
"""
  eval_prompt = ChatPromptTemplate.from_messages([
      ("system", eval_system_prompt),
      ("human", eval_user_message),
  ])

  return eval_prompt | llm | output_parser


known_bad_result = "There are lots of interesting facts. Tell me more about what you'd like to know"


bad_eval_chain = create_eval_chain(known_bad_result)


# response for wrong prompt
bad_eval_chain.invoke({})

'N'


%%writefile test_release_evals.py
from app import assistant_chain
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
import pytest


def create_eval_chain(
    agent_response,
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    output_parser=StrOutputParser(),
):
    delimiter = "####"
    eval_system_prompt = f"""You are an assistant that evaluates whether or not an assistant is producing valid quizzes.
  The assistant should be producing output in the format of Question N:{delimiter} <question N>?"""

    eval_user_message = f"""You are evaluating a generated quiz based on the context that the assistant uses to create the quiz.
  Here is the data:
    [BEGIN DATA]
    ************
    [Response]: {agent_response}
    ************
    [END DATA]

Read the response carefully and determine if it looks like a quiz or test. Do not evaluate if the information is correct
only evaluate if the data is in the expected format.

Output Y if the response is a quiz, output N if the response does not look like a quiz.
"""
    eval_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", eval_system_prompt),
            ("human", eval_user_message),
        ]
    )

    return eval_prompt | llm | output_parser


@pytest.fixture
def known_bad_result():
    return "There are lots of interesting facts. Tell me more about what you'd like to know"


@pytest.fixture
def quiz_request():
    return "Give me a quiz about Geography"


def test_model_graded_eval(quiz_request):
    assistant = assistant_chain()
    result = assistant.invoke({"question": quiz_request})
    print(result)
    eval_agent = create_eval_chain(result)
    eval_response = eval_agent.invoke({})
    assert eval_response == "Y"


def test_model_graded_eval_should_fail(known_bad_result):
    print(known_bad_result)
    eval_agent = create_eval_chain(known_bad_result)
    eval_response = eval_agent.invoke({})
    assert (
        eval_response == "Y"
    ), f"expected failure, asserted the response should be 'Y', \
    got back '{eval_response}'"

Writing test_release_evals.py


push_files_to_github(
    repo_name="gadkins/llmops-evals-example",
    branch_name="main",
    files=["app.py", "test_assistant.py", "test_release_evals.py"]
)

uploading app.py
uploading test_assistant.py
uploading test_release_evals.py
pushing files to: main
main already exists in the repository pushing updated changes


%%writefile quiz_bank.txt
1. Subject: Leonardo DaVinci
   Categories: Art, Science
   Facts:
    - Painted the Mona Lisa
    - Studied zoology, anatomy, geology, optics
    - Designed a flying machine

2. Subject: Paris
   Categories: Art, Geography
   Facts:
    - Location of the Louvre, the museum where the Mona Lisa is displayed
    - Capital of France
    - Most populous city in France
    - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie

3. Subject: Telescopes
   Category: Science
   Facts:
    - Device to observe different objects
    - The first refracting telescopes were invented in the Netherlands in the 17th Century
    - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror

4. Subject: Starry Night
   Category: Art
   Facts:
    - Painted by Vincent van Gogh in 1889
    - Captures the east-facing view of van Gogh's room in Saint-Rémy-de-Provence

5. Subject: Physics
   Category: Science
   Facts:
    - The sun doesn't change color during sunset.
    - Water slows the speed of light
    - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal.

Writing quiz_bank.txt


def read_file_into_string(file_path):
    try:
        with open(file_path, 'r') as file:
            file_content = file.read()
            return file_content
    except FileNotFoundError:
        print(f"The file at '{file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")


quiz_bank = read_file_into_string("quiz_bank.txt")


delimiter = "####"
system_message = f"""
Follow these steps to generate a customized quiz for the user.
The question will be delimited with four hashtags i.e {delimiter}

Step 1:{delimiter} First identify the category user is asking about from the following list:
* Geography
* Science
* Art

Step 2:{delimiter} Determine the subjects to generate questions about. The list of topics are below:

{quiz_bank}

Pick up to two subjects that fit the user's category.

Step 3:{delimiter} Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject.
* Include any facts that might be interesting

Use the following format:
Question 1:{delimiter} <question 1>

Question 2:{delimiter} <question 2>

Question 3:{delimiter} <question 3>
"""


from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser

def assistant_chain():
  human_template  = "{question}"

  chat_prompt = ChatPromptTemplate.from_messages([
      ("system", system_message),
      ("human", human_template),
  ])
  return chat_prompt | \
         ChatOpenAI(model="gpt-3.5-turbo",
                    temperature=0) | \
         StrOutputParser()


from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser

def create_eval_chain(context, agent_response):
  eval_system_prompt = """You are an assistant that evaluates \
    how well the quiz assistant creates quizzes for a user by \
    looking at the set of facts available to the assistant.
    Your primary concern is making sure that ONLY facts \
    available are used. Quizzes that contain facts outside
    the question bank are BAD quizzes and harmful to the student."""

  eval_user_message = """You are evaluating a generated quiz \
  based on the context that the assistant uses to create the quiz.
  Here is the data:
    [BEGIN DATA]
    ************
    [Question Bank]: {context}
    ************
    [Quiz]: {agent_response}
    ************
    [END DATA]

Compare the content of the submission with the question bank \
using the following steps

1. Review the question bank carefully. \
  These are the only facts the quiz can reference
2. Compare the quiz to the question bank.
3. Ignore differences in grammar or punctuation
4. If a fact is in the quiz, but not in the question bank \
   the quiz is bad.

Remember, the quizzes need to only include facts the assistant \
  is aware of. It is dangerous to allow made up facts.

Output Y if the quiz only contains facts from the question bank, \
output N if it contains facts that are not in the question bank.
"""
  eval_prompt = ChatPromptTemplate.from_messages([
      ("system", eval_system_prompt),
      ("human", eval_user_message),
  ])

  return eval_prompt | ChatOpenAI(
      model="gpt-3.5-turbo",
      temperature=0) | \
    StrOutputParser()


def test_model_graded_eval_hallucination(quiz_bank):
  assistant = assistant_chain()
  quiz_request = "Write me a quiz about books."
  result = assistant.invoke({"question": quiz_request})
  print(result)
  eval_agent = create_eval_chain(quiz_bank, result)
  eval_response = eval_agent.invoke({"context": quiz_bank, "agent_response": result})
  print(eval_response)
  # Our test asks about a subject not in the context, so the agent should answer N
  assert eval_response == "N"


test_model_graded_eval_hallucination(quiz_bank)

#### First identify the category user is asking about from the following list:
* Geography
* Science
* Art

#### Since you mentioned books, I will choose the category of Art for you.

#### Determine the subjects to generate questions about. The list of topics are below:

1. Subject: Leonardo DaVinci
   Categories: Art, Science
   Facts:
    - Painted the Mona Lisa
    - Studied zoology, anatomy, geology, optics
    - Designed a flying machine
  
2. Subject: Paris
   Categories: Art, Geography
   Facts:
    - Location of the Louvre, the museum where the Mona Lisa is displayed
    - Capital of France
    - Most populous city in France
    - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie

3. Subject: Telescopes
   Category: Science
   Facts:
    - Device to observe different objects
    - The first refracting telescopes were invented in the Netherlands in the 17th Century
    - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror

4. Subject: Starry Night
   Category: Art
   Facts:
    - Painted by Vincent van Gogh in 1889
    - Captures the east-facing view of van Gogh's room in Saint-Rémy-de-Provence

5. Subject: Physics
   Category: Science
   Facts:
    - The sun doesn't change color during sunset.
    - Water slows the speed of light
    - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal.

#### Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject.
* Include any facts that might be interesting

Question 1:#### Who painted the Mona Lisa?
Question 2:#### In which city is the Louvre located, where the Mona Lisa is displayed?
Question 3:#### What did Vincent van Gogh capture in his painting "Starry Night"?
N


eval_system_prompt = """You are an assistant that evaluates \
how well the quiz assistant
    creates quizzes for a user by looking at the set of \
    facts available to the assistant.
    Your primary concern is making sure that ONLY facts \
    available are used.
    Helpful quizzes only contain facts in the test set."""


eval_user_message = """You are evaluating a generated quiz based on the question bank that the assistant uses to create the quiz.
  Here is the data:
    [BEGIN DATA]
    ************
    [Question Bank]: {context}
    ************
    [Quiz]: {agent_response}
    ************
    [END DATA]

## Examples of quiz questions
Subject: <subject>
   Categories: <category1>, <category2>
   Facts:
    - <fact 1>
    - <fact 2>

## Steps to make a decision
Compare the content of the submission with the question bank using the following steps

1. Review the question bank carefully. These are the only facts the quiz can reference
2. Compare the information in the quiz to the question bank.
3. Ignore differences in grammar or punctuation

Remember, the quizzes should only include information from the question bank.


## Additional rules
- Output an explanation of whether the quiz only references information in the context.
- Make the explanation brief only include a summary of your reasoning for the decsion.
- Include a clear "Yes" or "No" as the first paragraph.
- Reference facts from the quiz bank if the answer is yes

Separate the decision and the explanation. For example:

************
Decision: <Y>
************
Explanation: <Explanation>
************
"""


eval_prompt = ChatPromptTemplate.from_messages([
      ("system", eval_system_prompt),
      ("human", eval_user_message),
  ])
eval_prompt

ChatPromptTemplate(input_variables=['agent_response', 'context'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an assistant that evaluates how well the quiz assistant\n    creates quizzes for a user by looking at the set of     facts available to the assistant.\n    Your primary concern is making sure that ONLY facts     available are used.\n    Helpful quizzes only contain facts in the test set.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['agent_response', 'context'], template='You are evaluating a generated quiz based on the question bank that the assistant uses to create the quiz.\n  Here is the data:\n    [BEGIN DATA]\n    ************\n    [Question Bank]: {context}\n    ************\n    [Quiz]: {agent_response}\n    ************\n    [END DATA]\n\n## Examples of quiz questions\nSubject: <subject>\n   Categories: <category1>, <category2>\n   Facts:\n    - <fact 1>\n    - <fact 2>\n\n## Steps to make a decision\nCompare the content of the submission with the question bank using the following steps\n\n1. Review the question bank carefully. These are the only facts the quiz can reference\n2. Compare the information in the quiz to the question bank.\n3. Ignore differences in grammar or punctuation\n\nRemember, the quizzes should only include information from the question bank.\n\n\n## Additional rules\n- Output an explanation of whether the quiz only references information in the context.\n- Make the explanation brief only include a summary of your reasoning for the decsion.\n- Include a clear "Yes" or "No" as the first paragraph.\n- Reference facts from the quiz bank if the answer is yes\n\nSeparate the decision and the explanation. For example:\n\n************\nDecision: <Y>\n************\nExplanation: <Explanation>\n************\n'))])


# In a real application you would load your dataset from a file or logging tool.
# Here we have a mix of examples with slightly different phrasing that our quiz
# application can support and things we don't support.
test_dataset = [
  {"input": "I'm trying to learn about science, can you give me a quiz to test my knowledge",
   "response": "science",
   "subjects": ["davinci", "telescope", "physics", "curie"]},
  {"input": "I'm an geography expert, give a quiz to prove it?",
   "response": "geography",
   "subjects": ["paris", "france", "louvre"]},
   {"input": "Quiz me about Italy",
   "response": "geography",
   "subjects": ["rome", "alps", "sicily"]
   },
]


def evaluate_dataset(dataset,
                     quiz_bank,
                     assistant,
                     evaluator):
  eval_results = []
  for row in dataset:
    eval_result = {}
    user_input = row["input"]
    answer = assistant.invoke({"question": user_input})
    eval_response = evaluator.invoke({"context": quiz_bank, "agent_response": answer})

    eval_result["input"] = user_input
    eval_result["output"] = answer
    eval_result["grader_response"] = eval_response
    eval_results.append(eval_result)
  return eval_results


def create_eval_chain(prompt):

  return prompt | \
    ChatOpenAI(model="gpt-3.5-turbo",
               temperature=0) | \
    StrOutputParser()


import pandas as pd
from app import assistant_chain, quiz_bank
from IPython.display import display, HTML

from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser


def report_evals(display_to_notebook=False):
  assistant = assistant_chain()
  model_graded_evaluator = create_eval_chain(eval_prompt)
  eval_results = evaluate_dataset(test_dataset,
                                  quiz_bank,
                                  assistant,
                                  model_graded_evaluator)
  df = pd.DataFrame(eval_results)
  ## clean up new lines to be html breaks
  df_html = df.to_html().replace("\\n","<br>")
  if display_to_notebook:
    display(HTML(df_html))
  else:
    print(df_html)


report_evals(display_to_notebook=True)


%%writefile save_eval_artifacts.py
import pandas as pd
from app import assistant_chain, quiz_bank
from IPython.display import display, HTML

from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser

eval_system_prompt = """You are an assistant that evaluates how well the quiz assistant
    creates quizzes for a user by looking at the set of facts available to the assistant.
    Your primary concern is making sure that ONLY facts available are used. Helpful quizzes only contain facts in the
    test set"""

eval_user_message = """You are evaluating a generated quiz based on the question bank that the assistant uses to create the quiz.
  Here is the data:
    [BEGIN DATA]
    ************
    [Question Bank]: {context}
    ************
    [Quiz]: {agent_response}
    ************
    [END DATA]

## Examples of quiz questions
Subject: <subject>
   Categories: <category1>, <category2>
   Facts:
    - <fact 1>
    - <fact 2>

## Steps to make a decision
Compare the content of the submission with the question bank using the following steps

1. Review the question bank carefully. These are the only facts the quiz can reference
2. Compare the information in the quiz to the question bank.
3. Ignore differences in grammar or punctuation

Remember, the quizzes should only include information from the question bank.


## Additional rules
- Output an explanation of whether the quiz only references information in the context.
- Make the explanation brief only include a summary of your reasoning for the decsion.
- Include a clear "Yes" or "No" as the first paragraph.
- Reference facts from the quiz bank if the answer is yes

Separate the decision and the explanation. For example:

************
Decision: <Y>
************
Explanation: <Explanation>
************
"""

# In a real application you would load your dataset from a file or logging tool.
# Here we have a mix of examples with slightly different phrasing that our quiz application can support
# and things we don't support.
dataset = [
    {
        "input": "I'm trying to learn about science, can you give me a quiz to test my knowledge",
        "response": "science",
        "subjects": ["davinci", "telescope", "physics", "curie"],
    },
    {
        "input": "I'm an geography expert, give a quiz to prove it?",
        "response": "geography",
        "subjects": ["paris", "france", "louvre"],
    },
    {
        "input": "Quiz me about Italy",
        "response": "geography",
        "subjects": ["rome", "alps", "sicily"],
    },
]


def create_eval_chain():
    eval_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", eval_system_prompt),
            ("human", eval_user_message),
        ]
    )

    return (
        eval_prompt
        | ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
        | StrOutputParser()
    )


def evaluate_dataset(dataset, quiz_bank, assistant, evaluator):
    eval_results = []
    for row in dataset:
        eval_result = {}
        user_input = row["input"]
        answer = assistant.invoke({"question": user_input})
        eval_response = evaluator.invoke(
            {"context": quiz_bank, "agent_response": answer}
        )

        eval_result["input"] = user_input
        eval_result["output"] = answer
        eval_result["grader_response"] = eval_response
        eval_results.append(eval_result)
    return eval_results


def report_evals():
    assistant = assistant_chain()
    model_graded_evaluator = create_eval_chain()
    eval_results = evaluate_dataset(
        dataset, quiz_bank, assistant, model_graded_evaluator
    )
    df = pd.DataFrame(eval_results)
    ## clean up new lines to be html breaks
    df_html = df.to_html().replace("\\n", "<br>")
    with open("/tmp/eval_results.html", "w") as f:
        f.write(df_html)


def main():
    report_evals()


if __name__ == "__main__":
    main()

Writing save_eval_artifacts.py


%%writefile circle_config.yml
version: 2.1
orbs:
  # The python orb contains a set of prepackaged circleci configuration you can use repeatedly in your configurations files
  # Orb commands and jobs help you with common scripting around a language/tool
  # so you dont have to copy and paste it everywhere.
  # See the orb documentation here: https://circleci.com/developer/orbs/orb/circleci/python
  python: circleci/python@2.1.1

parameters:
  eval-mode:
    type: string
    default: "commit"

workflows:
  evaluate-commit:
    when:
      equal: [ commit, << pipeline.parameters.eval-mode >> ]
    jobs:
      - run-commit-evals:
          context:
            - dl-ai-courses
  evaluate-release:
    when:
      equal: [ release, << pipeline.parameters.eval-mode >> ]
    jobs:
      - run-pre-release-evals:
          context:
            - dl-ai-courses
  evaluate-all:
    when:
      equal: [ full, << pipeline.parameters.eval-mode >> ]
    jobs:
      - run-manual-evals:
          context:
            - dl-ai-courses
  report-evals:
    when:
      equal: [ report, << pipeline.parameters.eval-mode >> ]
    jobs:
      - store-eval-artifacts:
          context:
            - dl-ai-courses

jobs:
  run-commit-evals:  # This is the name of the job, feel free to change it to better match what you're trying to do!
    # These next lines defines a docker executors: https://circleci.com/docs/2.0/executor-types/
    # You can specify an image from dockerhub or use one of the convenience images from CircleCI's Developer Hub
    # A list of available CircleCI docker convenience images are available here: https://circleci.com/developer/images/image/cimg/python
    # The executor is the environment in which the steps below will be executed - below will use a python 3.9 container
    # Change the version below to your required version of python
    docker:
      - image: cimg/python:3.10.5
    # Checkout the code as the first step. This is a dedicated CircleCI step.
    # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default.
    # Here we're making sure we use just use the system-wide pip. By default it uses the project root's requirements.txt.
    # Then run your tests!
    # CircleCI will report the results back to your VCS provider.
    steps:
      - checkout
      - python/install-packages:
          pkg-manager: pip
          # app-dir: ~/project/package-directory/  # If your requirements.txt isn't in the root directory.
          # pip-dependency-file: test-requirements.txt  # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
      - run:
          name: Run assistant evals.
          command: python -m pytest --junitxml results.xml test_hallucinations.py test_with_dataset.py
      - store_test_results:
          path: results.xml
  run-pre-release-evals:
    docker:
      - image: cimg/python:3.10.5
    steps:
      - checkout
      - python/install-packages:
          pkg-manager: pip
          # app-dir: ~/project/package-directory/  # If your requirements.txt isn't in the root directory.
          # pip-dependency-file: test-requirements.txt  # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
      - run:
          name: Run release evals.
          command: python -m pytest --junitxml results.xml test_release_evals.py
      - store_test_results:
          path: results.xml
  run-manual-evals:
    docker:
      - image: cimg/python:3.10.5
    steps:
      - checkout
      - python/install-packages:
          pkg-manager: pip
          # app-dir: ~/project/package-directory/  # If your requirements.txt isn't in the root directory.
          # pip-dependency-file: test-requirements.txt  # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
      - run:
          name: Run end to end evals.
          command: python -m pytest --junitxml results.xml test_release_evals.py
      - store_test_results:
          path: results.xml
  store-eval-artifacts:
    docker:
      - image: cimg/python:3.10.5
    steps:
      - checkout
      - python/install-packages:
          pkg-manager: pip
          # app-dir: ~/project/package-directory/  # If your requirements.txt isn't in the root directory.
          # pip-dependency-file: test-requirements.txt  # if you have a different name for your requirements file, maybe one that combines your runtime and test requirements.
      - run:
          name: Save eval to html file
          command: python save_eval_artifacts.py
      - store_artifacts:
          path: /tmp/eval_results.html
          destination: eval_results.html

Writing circle_config.yml


push_files_to_github(
    repo_name="gadkins/llmops-evals-example",
    branch_name="main",
    files=["app.py", "save_eval_artifacts.py", "quiz_bank.txt"]
)

uploading quiz_bank.txt
uploading save_eval_artifacts.py
uploading app.py
pushing files to: main
main already exists in the repository pushing updated changes

	input	output	grader_response
0	I'm trying to learn about science, can you give me a quiz to test my knowledge	#### First identify the category user is asking about from the following list: * Geography * Science * Art Science #### Determine the subjects to generate questions about. The list of topics are below: 3. Subject: Telescopes Category: Science Facts: - Device to observe different objects - The first refracting telescopes were invented in the Netherlands in the 17th Century - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror 5. Subject: Physics Category: Science Facts: - The sun doesn't change color during sunset. - Water slows the speed of light - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal. #### Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject. Question 1:#### What is the James Webb space telescope known for using as its mirror material? a) Silver b) Gold-berillyum c) Aluminum d) Copper Question 2:#### According to physics, why is the Eiffel Tower in Paris taller in the summer than the winter? a) Due to the Earth's rotation b) Due to the expansion of the metal c) Due to the weight of the tourists d) Due to the wind speed Question 3:#### What is the purpose of a refracting telescope? a) To observe different objects b) To study the behavior of light c) To measure the speed of sound d) To analyze chemical compounds	Decision: Yes ************ Explanation: The quiz generated only references information from the question bank. The questions are based on the subjects of Telescopes and Physics, and the facts provided for these subjects are used accurately in the quiz questions.
1	I'm an geography expert, give a quiz to prove it?	#### First identify the category user is asking about from the following list: * Geography * Science * Art #### Determine the subjects to generate questions about. The list of topics are below: 1. Subject: Leonardo DaVinci Categories: Art, Science Facts: - Painted the Mona Lisa - Studied zoology, anatomy, geology, optics - Designed a flying machine 2. Subject: Paris Categories: Art, Geography Facts: - Location of the Louvre, the museum where the Mona Lisa is displayed - Capital of France - Most populous city in France - Where Radium and Polonium were discovered by scientists Marie and Pierre Curie 3. Subject: Telescopes Category: Science Facts: - Device to observe different objects - The first refracting telescopes were invented in the Netherlands in the 17th Century - The James Webb space telescope is the largest telescope in space. It uses a gold-berillyum mirror 4. Subject: Starry Night Category: Art Facts: - Painted by Vincent van Gogh in 1889 - Captures the east-facing view of van Gogh's room in Saint-Rémy-de-Provence 5. Subject: Physics Category: Science Facts: - The sun doesn't change color during sunset. - Water slows the speed of light - The Eiffel Tower in Paris is taller in the summer than the winter due to expansion of the metal. #### Generate a quiz for the user. Based on the selected subjects generate 3 questions for the user using the facts about the subject. Question 1:#### What is the capital of France, where the famous Louvre museum is located? Question 2:#### In which city did scientists Marie and Pierre Curie discover Radium and Polonium? Question 3:#### Which city is known for having the Eiffel Tower, which is taller in the summer than the winter due to metal expansion?	Decision: No Explanation: The quiz includes information about the Eiffel Tower being taller in the summer than the winter due to metal expansion, which is not a fact from the question bank.
2	Quiz me about Italy	I'm sorry I do not have information about that.	Decision: No Explanation: The quiz mentions information about the Eiffel Tower in Paris being taller in the summer than the winter due to the expansion of the metal, which is not a fact present in the question bank. Therefore, the quiz contains information outside the provided question bank.

Prompt Engineering & Automated Evals with LangChain & CircleCI¶

Attribution¶

Why should you read this notebook?¶

Motivation¶

Pre-requisites¶

Source code¶

Table of contents¶

Setup¶

Install dependencies¶

Load keys¶

Rules-based Evals¶

Create sample app¶

Build a prompt template¶

Configure the LLM¶

Configure output parser¶

Create chain¶

Create a function to combine these steps¶

Evaluations¶

Create a function for a failing test case¶

Run evals using CircleCI pipeline¶

Run evals on every commit¶

CircleCI Config¶

Create requirements.txt¶

Push code to GitHub¶

(Optional) Manually trigger CircleCI pipeline¶

Model-graded Evals¶

Build prompt for model grading¶

Create known good test fixure¶

Structure the prompt¶

Create prompt template with LangChain¶

Configure an LLM for evaluation¶

Setup output parser¶

Create chain¶

Create eval chain function¶

Create known bad test fixture¶

Combine model-graded evals into a single file¶

Push evals to GitHub¶

Hallucination Detection¶

Rebuild quiz generator¶

Create grader eval chain¶

Create model to test for hallucinations¶

Add explanations for LLM grader's decision¶

Create a new test set¶

Generate eval reports¶

Combine into comprehensive eval with reporting¶

Update CircleCI config to store generated reports¶

Push to GitHub and trigger evals pipeline¶