Skip to content

Commit

Permalink
Merge pull request #2543 from moj-analytical-services/improve_llm_prompts
Browse files Browse the repository at this point in the history

improve llm prompt
  • Loading branch information
RobinL authored Dec 6, 2024
2 parents 8d78b98 + 5b94847 commit 75e1728
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 5 deletions.
4 changes: 2 additions & 2 deletions docs/topic_guides/llms/prompting_llms.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

We provide two files that summarise the Splink docs to help you use Splink with LLMs:

[Concise Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/e3b00bcd07cce609cf4ee60908d9e6d39c5b9c5f/llm_context_short.txt) - around 22,000 tokens
[Concise Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/5ddbe58d2cf9e8755a211d6091ce5ad6511fe506/llm_context_short.txt) - around 25,000 tokens

[Comprehensive Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/e3b00bcd07cce609cf4ee60908d9e6d39c5b9c5f/llm_context_long.txt) - around 72,000 tokens
[Comprehensive Splink Documentation Summary](https://gist.githubusercontent.com/RobinL/2257a0253d80ab8e651a5ec08bc3d00a/raw/5ddbe58d2cf9e8755a211d6091ce5ad6511fe506/llm_context_long.txt) - around 83,000 tokens

At present, only the concise one is short enough for the ChatGPT GUI. The longer one fits in the Claude long context (200k tokens) model.
### Recommended use
Expand Down
34 changes: 33 additions & 1 deletion scripts/create_llm_prompt_long.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import urllib.request

import nbformat
from bs4 import BeautifulSoup

import splink.blocking_analysis as blocking_analysis
import splink.comparison_level_library as cll
Expand Down Expand Up @@ -221,6 +222,17 @@ def fetch_url_content(url):
return response.read().decode("utf-8")


def fetch_article_content(url):
    """Fetch *url* and return the page's visible text content.

    Sends a browser-like User-Agent header because some sites reject
    requests bearing the default urllib agent string.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"  # NOQA: E501
    }
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    # separator=" " prevents text from adjacent tags being fused into one
    # word: get_text(strip=True) alone concatenates the stripped fragments
    # with no delimiter at all, which mangles the extracted article text.
    return soup.get_text(separator=" ", strip=True)


additional_instructions_to_llms = """
If the user asks for Splink code, when writing Splink code use ONLY
functions and methods which you've seen used in the context provided. Do NOT guess at
Expand Down Expand Up @@ -330,12 +342,32 @@ def fetch_url_content(url):
extract_and_append_md_content(mds_to_append, output_filename)

# Fetch and append content from the URL
url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md"
url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md" # NOQA: E501
splink_tips = fetch_url_content(url)
with open(output_filename, "a", encoding="utf-8") as f:
f.write("\n\nSplink Tips:\n")
f.write(splink_tips)

# Add the blog articles
blog_urls = [
"https://www.robinlinacre.com/intro_to_probabilistic_linkage/",
"https://www.robinlinacre.com/partial_match_weights/",
"https://www.robinlinacre.com/m_and_u_values/",
"https://www.robinlinacre.com/maths_of_fellegi_sunter/",
"https://www.robinlinacre.com/computing_fellegi_sunter/",
"https://www.robinlinacre.com/fellegi_sunter_accuracy/",
"https://www.robinlinacre.com/em_intuition/",
]

with open(output_filename, "a", encoding="utf-8") as f:
f.write("\n\nBlog Articles:\n")
for url in blog_urls:
print(f"Fetching article from {url}...") # NOQA: T201
content = fetch_article_content(url)
f.write(f"\n\nArticle from {url}:\n")
f.write(content)
f.write("\n\n")

# Append additional instructions to the output file
with open(output_filename, "a", encoding="utf-8") as f:
f.write("IMPORTANT Instructions to LLMs:")
Expand Down
48 changes: 46 additions & 2 deletions scripts/create_llm_prompt_short.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import urllib.request

import nbformat
from bs4 import BeautifulSoup


# Function to extract only the Python code from input cells of type code
Expand All @@ -26,7 +27,19 @@ def extract_and_append_notebook_code(base_dir, output_filename):
if file.endswith(".ipynb") and not file.endswith("-checkpoint.ipynb"):
notebook_path = os.path.join(root, file)
# Skip files with athena or sqlite in path
if any(x in notebook_path.lower() for x in ["athena", "sqlite"]):
if any(
x in notebook_path.lower()
for x in [
"athena",
"sqlite",
"bias_eval",
"febrl3",
"deterministic",
"quick_and_dirty",
"real_time",
"playground",
]
):
print(f"Skipping {notebook_path} due to athena/sqlite...") # noqa: T201
continue

Expand Down Expand Up @@ -64,6 +77,17 @@ def fetch_url_content(url):
return response.read().decode("utf-8")


def fetch_article_content(url):
    """Fetch *url* and return the page's visible text content.

    Sends a browser-like User-Agent header because some sites reject
    requests bearing the default urllib agent string.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"  # NOQA: E501
    }
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as response:
        html = response.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    # separator=" " prevents text from adjacent tags being fused into one
    # word: get_text(strip=True) alone concatenates the stripped fragments
    # with no delimiter at all, which mangles the extracted article text.
    return soup.get_text(separator=" ", strip=True)


additional_instructions_to_llms = """
If the user asks for Splink code, when writing Splink code use ONLY
functions and methods which you've seen used in the context provided. Do NOT guess at
Expand Down Expand Up @@ -99,12 +123,32 @@ def fetch_url_content(url):
extract_and_append_md_content(mds_to_append, output_filename)

# Fetch and append content from the URL
url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md"
url = "https://gist.githubusercontent.com/RobinL/edb10e93caeaf47c675cbfa189e4e30c/raw/fbe773db3002663dd3ddb439e38d2a549358e713/top_tips.md" # NOQA: E501
splink_tips = fetch_url_content(url)
with open(output_filename, "a", encoding="utf-8") as f:
f.write("\n\nSplink Tips:\n")
f.write(splink_tips)

# Add the blog articles
blog_urls = [
"https://www.robinlinacre.com/intro_to_probabilistic_linkage/",
"https://www.robinlinacre.com/partial_match_weights/",
"https://www.robinlinacre.com/m_and_u_values/",
# "https://www.robinlinacre.com/maths_of_fellegi_sunter/",
# "https://www.robinlinacre.com/computing_fellegi_sunter/",
"https://www.robinlinacre.com/fellegi_sunter_accuracy/",
"https://www.robinlinacre.com/em_intuition/",
]

with open(output_filename, "a", encoding="utf-8") as f:
f.write("\n\nBlog Articles:\n")
for url in blog_urls:
print(f"Fetching article from {url}...") # noqa: T201
content = fetch_article_content(url)
f.write(f"\n\nArticle from {url}:\n")
f.write(content)
f.write("\n\n")

# Append additional instructions to the output file
with open(output_filename, "a", encoding="utf-8") as f:
f.write("IMPORTANT Instructions to LLMs:")
Expand Down

0 comments on commit 75e1728

Please sign in to comment.