Skip to content

Commit

Permalink
Merge pull request #2543 from moj-analytical-services/improve_llm_prompts
Browse files Browse the repository at this point in the history

improve llm prompt
  • Loading branch information
RobinL authored Dec 6, 2024
2 parents 8d78b98 + 5b94847 commit 75e1728
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 5 deletions.
4 changes: 2 additions & 2 deletions docs/topic_guides/llms/prompting_llms.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

We provide two files that summarise the Splink docs to help you use Splink with LLMs:

[Concise Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/e3b00bcd07cce609cf4ee60908d9e6d39c5b9c5f/llm_context_short.txt) - around 22,000 tokens
[Concise Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/5ddbe58d2cf9e8755a211d6091ce5ad6511fe506/llm_context_short.txt) - around 25,000 tokens

[Comprehensive Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/e3b00bcd07cce609cf4ee60908d9e6d39c5b9c5f/llm_context_long.txt) - around 72,000 tokens
[Comprehensive Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/5ddbe58d2cf9e8755a211d6091ce5ad6511fe506/llm_context_long.txt) - around 83,000 tokens

At present, only the concise one is short enough for the ChatGPT GUI. The longer one fits in the Claude long context (200k tokens) model.
### Recommended use
Expand Down
34 changes: 33 additions & 1 deletion scripts/create_llm_prompt_long.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import urllib.request

import nbformat
from bs4 import BeautifulSoup

import splink.blocking_analysis as blocking_analysis
import splink.comparison_level_library as cll
Expand Down Expand Up @@ -221,6 +222,17 @@ def fetch_url_content(url):
return response.read().decode("utf-8")


def fetch_article_content(url):
    """Fetch *url* and return the page's visible text content.

    Sends a browser-like User-Agent header because some sites reject
    requests bearing the default urllib agent string.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"  # NOQA: E501
    }
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    # separator=" " prevents text from adjacent tags being fused into one
    # word: get_text(strip=True) alone concatenates the stripped fragments
    # with no delimiter at all, which mangles the extracted article text.
    return soup.get_text(separator=" ", strip=True)


additional_instructions_to_llms = """
If the user asks for Splink code, when writing Splink code use ONLY
functions and methods which you've seen used in the context provided. Do NOT guess at
Expand Down Expand Up @@ -330,12 +342,32 @@ def fetch_url_content(url):
extract_and_append_md_content(mds_to_append, output_filename)

# Fetch and append content from the URL
url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md"
url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md" # NOQA: E501
splink_tips = fetch_url_content(url)
with open(output_filename, "a", encoding="utf-8") as f:
f.write("\n\nSplink Tips:\n")
f.write(splink_tips)

# Add the blog articles
blog_urls = [
"https://www.robinlinacre.com/intro_to_probabilistic_linkage/",
"https://www.robinlinacre.com/partial_match_weights/",
"https://www.robinlinacre.com/m_and_u_values/",
"https://www.robinlinacre.com/maths_of_fellegi_sunter/",
"https://www.robinlinacre.com/computing_fellegi_sunter/",
"https://www.robinlinacre.com/fellegi_sunter_accuracy/",
"https://www.robinlinacre.com/em_intuition/",
]

with open(output_filename, "a", encoding="utf-8") as f:
f.write("\n\nBlog Articles:\n")
for url in blog_urls:
print(f"Fetching article from {url}...") # NOQA: T201
content = fetch_article_content(url)
f.write(f"\n\nArticle from {url}:\n")
f.write(content)
f.write("\n\n")

# Append additional instructions to the output file
with open(output_filename, "a", encoding="utf-8") as f:
f.write("IMPORTANT Instructions to LLMs:")
Expand Down
48 changes: 46 additions & 2 deletions scripts/create_llm_prompt_short.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import urllib.request

import nbformat
from bs4 import BeautifulSoup


# Function to extract only the Python code from input cells of type code
Expand All @@ -26,7 +27,19 @@ def extract_and_append_notebook_code(base_dir, output_filename):
if file.endswith(".ipynb") and not file.endswith("-checkpoint.ipynb"):
notebook_path = os.path.join(root, file)
# Skip files with athena or sqlite in path
if any(x in notebook_path.lower() for x in ["athena", "sqlite"]):
if any(
x in notebook_path.lower()
for x in [
"athena",
"sqlite",
"bias_eval",
"febrl3",
"deterministic",
"quick_and_dirty",
"real_time",
"playground",
]
):
print(f"Skipping {notebook_path} due to athena/sqlite...") # noqa: T201
continue

Expand Down Expand Up @@ -64,6 +77,17 @@ def fetch_url_content(url):
return response.read().decode("utf-8")


def fetch_article_content(url):
    """Fetch *url* and return the page's visible text content.

    Sends a browser-like User-Agent header because some sites reject
    requests bearing the default urllib agent string.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"  # NOQA: E501
    }
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    # separator=" " prevents text from adjacent tags being fused into one
    # word: get_text(strip=True) alone concatenates the stripped fragments
    # with no delimiter at all, which mangles the extracted article text.
    return soup.get_text(separator=" ", strip=True)


additional_instructions_to_llms = """
If the user asks for Splink code, when writing Splink code use ONLY
functions and methods which you've seen used in the context provided. Do NOT guess at
Expand Down Expand Up @@ -99,12 +123,32 @@ def fetch_url_content(url):
extract_and_append_md_content(mds_to_append, output_filename)

# Fetch and append content from the URL
url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md"
url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md" # NOQA: E501
splink_tips = fetch_url_content(url)
with open(output_filename, "a", encoding="utf-8") as f:
f.write("\n\nSplink Tips:\n")
f.write(splink_tips)

# Add the blog articles
blog_urls = [
"https://www.robinlinacre.com/intro_to_probabilistic_linkage/",
"https://www.robinlinacre.com/partial_match_weights/",
"https://www.robinlinacre.com/m_and_u_values/",
# "https://www.robinlinacre.com/maths_of_fellegi_sunter/",
# "https://www.robinlinacre.com/computing_fellegi_sunter/",
"https://www.robinlinacre.com/fellegi_sunter_accuracy/",
"https://www.robinlinacre.com/em_intuition/",
]

with open(output_filename, "a", encoding="utf-8") as f:
f.write("\n\nBlog Articles:\n")
for url in blog_urls:
print(f"Fetching article from {url}...") # noqa: T201
content = fetch_article_content(url)
f.write(f"\n\nArticle from {url}:\n")
f.write(content)
f.write("\n\n")

# Append additional instructions to the output file
with open(output_filename, "a", encoding="utf-8") as f:
f.write("IMPORTANT Instructions to LLMs:")
Expand Down

0 comments on commit 75e1728

Please sign in to comment.