SingleStore Notebooks
New
Launch Open-Source Apps with LangChain
Notebook
In [1]:
%%writefile requirements.txtjinja2==3.0.3langchain==0.0.339openai==1.3.3pdf2imagepdfminerpdfminer.sixpillow_heiftabulatetiktokenunstructuredopencv-python-headlessunstructured.pytesseractunstructured.inference
In [2]:
# Install the dependencies pinned in requirements.txt into this kernel's environment.
# (%pip, rather than !pip, guarantees the install targets the running kernel.)
%pip install -r requirements.txt --quiet
In [3]:
from langchain.document_loaders import OnlinePDFLoader

# Download the sample PDF and parse it into LangChain Document objects.
# `data` is reused by the cells below.
pdf_loader = OnlinePDFLoader("http://leavcom.com/pdf/DBpdf.pdf")
data = pdf_loader.load()
In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Quick sanity check on what the loader produced before splitting.
print(f"You have {len(data)} document(s) in your data")
print(f"There are {len(data[0].page_content)} characters in your document")
In [5]:
# Chunk the document into ~2000-character pieces with no overlap.
# `texts` is the list of chunks embedded and stored in the cells below.
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = splitter.split_documents(data)
print(f"You have {len(texts)} pages")
In [6]:
%%sql
-- Start from a clean slate: recreate the working database.
DROP DATABASE IF EXISTS pdf_db;
CREATE DATABASE IF NOT EXISTS pdf_db;
Action Required
Make sure to select the pdf_db database from the drop-down menu at the top of this notebook. It updates the connection_url to connect to that database.
In [7]:
%%sql
-- Table holding one row per text chunk: the raw text plus its
-- embedding packed as a 32-bit-float binary vector.
DROP TABLE IF EXISTS pdf_docs1;
CREATE TABLE IF NOT EXISTS pdf_docs1 (
    id INT PRIMARY KEY,
    content TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
    vector BLOB
);
In [8]:
import getpass
import os

# Prompt for the key instead of hardcoding it; the OpenAI client
# reads OPENAI_API_KEY from the environment.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
In [9]:
import json

import sqlalchemy as sa
from langchain.embeddings import OpenAIEmbeddings
from singlestoredb import create_engine

conn = create_engine().connect()

embedder = OpenAIEmbeddings()

# Fetch all embeddings in one API call (cheaper and faster than per-chunk calls).
embeddings = embedder.embed_documents([doc.page_content for doc in texts])

# Build one parameter dict per chunk for a single batched INSERT.
# BUG FIX: insert the chunk's text (`doc.page_content`), not the Document
# object itself — the original stored the Document's repr in the TEXT column.
params = []
for i, (doc, embedding) in enumerate(zip(texts, embeddings)):
    params.append(
        dict(
            id=i + 1,
            content=doc.page_content,
            vector=json.dumps(embedding),
        )
    )

stmt = sa.text("""
    INSERT INTO pdf_docs1 (id, content, vector)
    VALUES (:id, :content, JSON_ARRAY_PACK_F32(:vector))
""")

conn.execute(stmt, params)
In [10]:
%%sql
-- Sanity check: unpack one stored vector back into a readable float array.
SELECT JSON_ARRAY_UNPACK_F32(vector) as vector
FROM pdf_docs1
LIMIT 1;
In [11]:
# Embed the question and find the single most similar chunk by dot product.
# `query_text` and `row` are read again by the answer-generation cell below.
query_text = "Will object-oriented databases be commercially successful?"
query_embedding = embedder.embed_documents([query_text])[0]

search_stmt = sa.text("""
    SELECT
        content,
        DOT_PRODUCT_F32(JSON_ARRAY_PACK_F32(:embedding), vector) AS score
    FROM pdf_docs1
    ORDER BY score DESC
    LIMIT 1
""")

results = conn.execute(search_stmt, dict(embedding=json.dumps(query_embedding)))
for row in results:
    print(row[0])
In [12]:
import openai

# NOTE(review): `row` leaks out of the previous cell's for-loop; this cell
# only works when that cell ran immediately before and returned a result.
client = openai.OpenAI()

prompt = f"The user asked: {query_text}. The most similar text from the document is: {row[0]}"

# Ask the chat model to answer using the retrieved chunk as context.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ],
)

print(response.choices[0].message.content)
Clean up
In [13]:
%%sql
-- Remove everything this notebook created.
DROP DATABASE IF EXISTS pdf_db
Details
Tags
#vectordb #genai #langchain
License
This Notebook has been released under the Apache 2.0 open source license.