Add supabase as vector DB
kaarthik108 committed Jun 25, 2023
1 parent 3e30ee6 commit 0b2a2c9
Showing 18 changed files with 320 additions and 205 deletions.
13 changes: 8 additions & 5 deletions README.md
@@ -4,6 +4,7 @@
 [![Streamlit](https://img.shields.io/badge/-Streamlit-FF4B4B?style=flat-square&logo=streamlit&logoColor=white)](https://streamlit.io/)
 [![OpenAI](https://img.shields.io/badge/-OpenAI-412991?style=flat-square&logo=openai&logoColor=white)](https://openai.com/)
 [![Snowflake](https://img.shields.io/badge/-Snowflake-29BFFF?style=flat-square&logo=snowflake&logoColor=white)](https://www.snowflake.com/en/)
+[![Supabase](https://img.shields.io/badge/-Supabase-00C04A?style=flat-square&logo=supabase&logoColor=white)](https://www.supabase.io/)
 
 [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://snowchat.streamlit.app/)
 
@@ -18,7 +19,7 @@
 - Interactive and user-friendly interface
 - Integration with Snowflake Data Warehouse
 - Utilizes OpenAI's GPT-3.5-turbo-16k and text-embedding-ada-002
-- Uses In-memory Vector Database FAISS for storing and searching through vectors
+- Uses the Supabase pgvector database for storing and searching through vectors
 
 ## 🛠️ Installation
 
@@ -29,13 +30,15 @@
 cd snowchat
 pip install -r requirements.txt
 
-3. Set up your `OPENAI_API_KEY`, `ACCOUNT`, `USER_NAME`, `PASSWORD`, `ROLE`, `DATABASE`, `SCHEMA` and `WAREHOUSE` in project directory `secrets.toml`. If you don't have access to GPT-4 change the script in chain.py replace gpt-4 in model_name to gpt-3.5-turbo
+3. Set up your `OPENAI_API_KEY`, `ACCOUNT`, `USER_NAME`, `PASSWORD`, `ROLE`, `DATABASE`, `SCHEMA`, `WAREHOUSE`, `SUPABASE_URL` and `SUPABASE_SERVICE_KEY` in the project's `secrets.toml`.
 
-4. Make you're schema.md that matches you're database.
+4. Write schema markdown files that match your database and store them in the `docs` folder.
 
-5. Run `python ingest.py` to get convert to embeddings and store as an index file.
+5. Create the Supabase extension, table and function by running `supabase/scripts.sql`.
 
-6. Run the Streamlit app to start chatting:
+6. Run `python ingest.py` to convert the schema docs into embeddings and store them in the vector store.
+
+7. Run the Streamlit app to start chatting:
 streamlit run main.py
 
 ## 📚 Usage
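The keys listed in step 3 can be sanity-checked before launching the app. The helper below is a hypothetical sketch, not part of this commit; it assumes only the key names the README lists:

```python
# check_secrets.py -- hypothetical helper, not part of this commit.
# Assumes only the key names listed in step 3 of the README.
import streamlit as st

REQUIRED_KEYS = [
    "OPENAI_API_KEY", "ACCOUNT", "USER_NAME", "PASSWORD",
    "ROLE", "DATABASE", "SCHEMA", "WAREHOUSE",
    "SUPABASE_URL", "SUPABASE_SERVICE_KEY",
]

missing = [key for key in REQUIRED_KEYS if key not in st.secrets]
if missing:
    raise SystemExit(f"Missing entries in secrets.toml: {', '.join(missing)}")
print("All required secrets are present.")
```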
46 changes: 20 additions & 26 deletions chain.py
@@ -1,8 +1,5 @@
 from langchain.prompts.prompt import PromptTemplate
-from langchain.chains import (
-    ConversationalRetrievalChain,
-    LLMChain
-)
+from langchain.chains import ConversationalRetrievalChain, LLMChain
 from langchain.chains.question_answering import load_qa_chain
 from langchain.llms import OpenAI
 import streamlit as st
@@ -15,7 +12,7 @@
{question}
\"""
Standalone question:"""

condense_question_prompt = PromptTemplate.from_template(template)

TEMPLATE = """ You're a helpful AI assistant who is specialized in data analysis using SQL. You have to write sql code in snowflake database based on the following question. Give a one or two sentences about how did you arrive at that sql code. (do not assume anything if the column is not available then say it is not available, do not make up code). Write the sql code in markdown format.
@@ -25,35 +22,32 @@
 Answer:
 """
 QA_PROMPT = PromptTemplate(template=TEMPLATE, input_variables=["question", "context"])
 
 
 def get_chain(vectorstore):
     """
     Get a chain for chatting with a vector database.
     """
-    q_llm = OpenAI(temperature=0, openai_api_key=st.secrets["OPENAI_API_KEY"], model_name='gpt-3.5-turbo-16k')
-
-    llm = OpenAI(
-        model_name='gpt-3.5-turbo',
-        temperature=0,
-        openai_api_key=st.secrets["OPENAI_API_KEY"]
-    )
-
-    question_generator = LLMChain(
-        llm=q_llm,
-        prompt=condense_question_prompt
-    )
-    doc_chain = load_qa_chain(
-        llm=llm,
-        chain_type="stuff",
-        prompt=QA_PROMPT
-    )
+    q_llm = OpenAI(
+        temperature=0,
+        openai_api_key=st.secrets["OPENAI_API_KEY"],
+        model_name="gpt-3.5-turbo-16k",
+    )
+
+    llm = OpenAI(
+        model_name="gpt-3.5-turbo",
+        temperature=0,
+        openai_api_key=st.secrets["OPENAI_API_KEY"],
+    )
+
+    question_generator = LLMChain(llm=q_llm, prompt=condense_question_prompt)
+
+    doc_chain = load_qa_chain(llm=llm, chain_type="stuff", prompt=QA_PROMPT)
     chain = ConversationalRetrievalChain(
-        retriever=vectorstore.as_retriever(),
-        combine_docs_chain=doc_chain,
-        question_generator=question_generator
-    )
-    return chain
+        retriever=vectorstore.as_retriever(),
+        combine_docs_chain=doc_chain,
+        question_generator=question_generator,
+    )
+    return chain
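For reference, a minimal sketch of how `get_chain` could be wired to the new Supabase store. The `documents` table and `match_documents` function names are LangChain's defaults and are assumed to be what `supabase/scripts.sql` creates; adjust them if the script uses different names:

```python
# Usage sketch, not part of this commit. Table and function names are the
# assumed LangChain defaults ("documents" / "match_documents").
import streamlit as st
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import SupabaseVectorStore
from supabase.client import create_client

from chain import get_chain

client = create_client(st.secrets["SUPABASE_URL"], st.secrets["SUPABASE_SERVICE_KEY"])
vectorstore = SupabaseVectorStore(
    client=client,
    embedding=OpenAIEmbeddings(openai_api_key=st.secrets["OPENAI_API_KEY"]),
    table_name="documents",
    query_name="match_documents",
)

chain = get_chain(vectorstore)
result = chain({"question": "What is the total amount paid by each customer?", "chat_history": []})
print(result["answer"])
```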
10 changes: 10 additions & 0 deletions docs/customer_details.md
@@ -0,0 +1,10 @@
**Table 1: STREAM_HACKATHON.STREAMLIT.CUSTOMER_DETAILS** (Stores customer information)

This table contains the personal information of customers who have made purchases on the platform.

- CUSTOMER_ID: Number (38,0) [Primary Key, Not Null] - Unique identifier for customers
- FIRST_NAME: Varchar (255) - First name of the customer
- LAST_NAME: Varchar (255) - Last name of the customer
- EMAIL: Varchar (255) - Email address of the customer
- PHONE: Varchar (20) - Phone number of the customer
- ADDRESS: Varchar (255) - Physical address of the customer
8 changes: 8 additions & 0 deletions docs/order_details.md
@@ -0,0 +1,8 @@
**Table 2: STREAM_HACKATHON.STREAMLIT.ORDER_DETAILS** (Stores order information)

This table contains information about orders placed by customers, including the date and total amount of each order.

- ORDER_ID: Number (38,0) [Primary Key, Not Null] - Unique identifier for orders
- CUSTOMER_ID: Number (38,0) [Foreign Key - CUSTOMER_DETAILS(CUSTOMER_ID)] - Customer who made the order
- ORDER_DATE: Date - Date when the order was made
- TOTAL_AMOUNT: Number (10,2) - Total amount of the order
8 changes: 8 additions & 0 deletions docs/payments.md
@@ -0,0 +1,8 @@
**Table 3: STREAM_HACKATHON.STREAMLIT.PAYMENTS** (Stores payment information)

This table contains information about payments made by customers for their orders, including the date and amount of each payment.

- PAYMENT_ID: Number (38,0) [Primary Key, Not Null] - Unique identifier for payments
- ORDER_ID: Number (38,0) [Foreign Key - ORDER_DETAILS(ORDER_ID)] - Associated order for the payment
- PAYMENT_DATE: Date - Date when the payment was made
- AMOUNT: Number (10,2) - Amount of the payment
8 changes: 8 additions & 0 deletions docs/products.md
@@ -0,0 +1,8 @@
**Table 4: STREAM_HACKATHON.STREAMLIT.PRODUCTS** (Stores product information)

This table contains information about the products available for purchase on the platform, including their name, category, and price.

- PRODUCT_ID: Number (38,0) [Primary Key, Not Null] - Unique identifier for products
- PRODUCT_NAME: Varchar (255) - Name of the product
- CATEGORY: Varchar (255) - Category of the product
- PRICE: Number (10,2) - Price of the product
9 changes: 9 additions & 0 deletions docs/transactions.md
@@ -0,0 +1,9 @@
**Table 5: STREAM_HACKATHON.STREAMLIT.TRANSACTIONS** (Stores transaction information)

This table contains information about individual transactions that occur when customers purchase products, including the associated order, product, quantity, and price.

- TRANSACTION_ID: Number (38,0) [Primary Key, Not Null] - Unique identifier for transactions
- ORDER_ID: Number (38,0) [Foreign Key - ORDER_DETAILS(ORDER_ID)] - Associated order for the transaction
- PRODUCT_ID: Number (38,0) - Product involved in the transaction
- QUANTITY: Number (38,0) - Quantity of the product in the transaction
- PRICE: Number (10,2) - Price of the product in the transaction
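Together these five tables describe a small order-processing schema. As an illustration of the kind of Snowflake SQL the assistant is expected to generate against it, here is a hedged sketch that uses the connection secrets from the README; it is not code from this repository:

```python
# Illustrative sketch, not part of this commit: runs one example query
# against the documented schema using the secrets listed in the README.
import snowflake.connector
import streamlit as st

conn = snowflake.connector.connect(
    account=st.secrets["ACCOUNT"],
    user=st.secrets["USER_NAME"],
    password=st.secrets["PASSWORD"],
    role=st.secrets["ROLE"],
    database=st.secrets["DATABASE"],
    schema=st.secrets["SCHEMA"],
    warehouse=st.secrets["WAREHOUSE"],
)
cur = conn.cursor()
cur.execute(
    """
    SELECT c.FIRST_NAME, c.LAST_NAME, SUM(o.TOTAL_AMOUNT) AS TOTAL_SPENT
    FROM STREAM_HACKATHON.STREAMLIT.CUSTOMER_DETAILS c
    JOIN STREAM_HACKATHON.STREAMLIT.ORDER_DETAILS o
      ON o.CUSTOMER_ID = c.CUSTOMER_ID
    GROUP BY c.FIRST_NAME, c.LAST_NAME
    ORDER BY TOTAL_SPENT DESC
    """
)
for first_name, last_name, total_spent in cur.fetchmany(10):
    print(first_name, last_name, total_spent)
cur.close()
conn.close()
```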
Binary file removed faiss_index/index.faiss
Binary file removed faiss_index/index.pkl
65 changes: 51 additions & 14 deletions ingest.py
@@ -1,20 +1,57 @@

-from langchain.embeddings import OpenAIEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.document_loaders import UnstructuredMarkdownLoader
-from langchain.vectorstores import FAISS
+from pydantic import BaseModel
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import SupabaseVectorStore
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.document_loaders import DirectoryLoader
 import streamlit as st
+from supabase.client import Client, create_client
+from typing import Any, Dict
 
 
+class Secrets(BaseModel):
+    SUPABASE_URL: str
+    SUPABASE_SERVICE_KEY: str
+    OPENAI_API_KEY: str
+
+
+class Config(BaseModel):
+    chunk_size: int = 1000
+    chunk_overlap: int = 0
+    docs_dir: str = "docs/"
+    docs_glob: str = "**/*.md"
+
+
+class DocumentProcessor:
+    def __init__(self, secrets: Secrets, config: Config):
+        self.client: Client = create_client(
+            secrets.SUPABASE_URL, secrets.SUPABASE_SERVICE_KEY
+        )
+        self.loader = DirectoryLoader(config.docs_dir, glob=config.docs_glob)
+        self.text_splitter = CharacterTextSplitter(
+            chunk_size=config.chunk_size, chunk_overlap=config.chunk_overlap
+        )
+        self.embeddings = OpenAIEmbeddings(openai_api_key=secrets.OPENAI_API_KEY)
 
-loader = UnstructuredMarkdownLoader('schema.md')
-data = loader.load()
+    def process(self) -> Dict[str, Any]:
+        data = self.loader.load()
+        texts = self.text_splitter.split_documents(data)
+        vector_store = SupabaseVectorStore.from_documents(
+            texts, self.embeddings, client=self.client
+        )
+        return vector_store
 
-text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
-texts = text_splitter.split_documents(data)
 
-embeddings = OpenAIEmbeddings(openai_api_key = st.secrets["OPENAI_API_KEY"])
-docsearch = FAISS.from_documents(texts, embeddings)
+def run():
+    secrets = Secrets(
+        SUPABASE_URL=st.secrets["SUPABASE_URL"],
+        SUPABASE_SERVICE_KEY=st.secrets["SUPABASE_SERVICE_KEY"],
+        OPENAI_API_KEY=st.secrets["OPENAI_API_KEY"],
+    )
+    config = Config()
+    doc_processor = DocumentProcessor(secrets, config)
+    result = doc_processor.process()
+    return result
 
-docsearch.save_local("faiss_index")
-
-# with open("vectors.pkl", "wb") as f:
-#     pickle.dump(docsearch, f)
+if __name__ == "__main__":
+    run()
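After `ingest.py` runs, the stored vectors can be spot-checked with a similarity search. This is a sketch under the same assumption of LangChain's default `documents` table and `match_documents` function:

```python
# Verification sketch, not part of this commit. Assumes the default
# "documents" table and "match_documents" function in Supabase.
import streamlit as st
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import SupabaseVectorStore
from supabase.client import create_client

client = create_client(st.secrets["SUPABASE_URL"], st.secrets["SUPABASE_SERVICE_KEY"])
store = SupabaseVectorStore(
    client=client,
    embedding=OpenAIEmbeddings(openai_api_key=st.secrets["OPENAI_API_KEY"]),
    table_name="documents",
    query_name="match_documents",
)

# Chunks of docs/payments.md should rank highly for a payments question.
for doc in store.similarity_search("Which table stores payment information?", k=2):
    print(doc.metadata, doc.page_content[:80])
```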