Skip to content

Commit 0d7a72c

Browse files
authored
Merge pull request #57 from epsilla-cloud/dev
merge dev into main
2 parents f4f2767 + f8c992c commit 0d7a72c

11 files changed

+475
-338
lines changed

examples/Question_Answering_Pipeline_with_LangChain_and_Epsilla.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
# Step2. Configure the OpenAI API Key
1818
import os
19+
1920
os.environ["OPENAI_API_KEY"] = "Your-OpenAI-API-Key"
2021

2122

@@ -24,9 +25,13 @@
2425
from langchain.text_splitter import CharacterTextSplitter
2526
from langchain_openai import OpenAIEmbeddings
2627

27-
loader = WebBaseLoader("https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt")
28+
loader = WebBaseLoader(
29+
"https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt"
30+
)
2831
documents = loader.load()
29-
documents = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(documents)
32+
documents = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(
33+
documents
34+
)
3035
embeddings = OpenAIEmbeddings()
3136

3237

@@ -49,8 +54,6 @@
4954
)
5055

5156

52-
53-
5457
# Step4. Create the QA for Retrieval
5558
from langchain.chains import RetrievalQA
5659
from langchain_openai import OpenAI

examples/Question_Answering_Pipeline_with_LangChain_and_EpsillaCloud.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,55 +18,61 @@
1818
import os
1919

2020
os.environ["OPENAI_API_KEY"] = "Your-OpenAI-API-Key"
21-
epsilla_api_key = os.getenv("EPSILLA_API_KEY", "Your-Epsilla-API-Key")
22-
project_id = os.getenv("EPSILLA_PROJECT_ID", "Your-Project-ID")
23-
db_id = os.getenv("EPSILLA_DB_ID", "Your-DB-ID")
24-
db_sharding_id = os.getenv("EPSILLA_DB_SHARDING_ID", 0)
2521

22+
EPSILLA_PROJECT_ID = os.getenv("EPSILLA_PROJECT_ID", "Your-Epsilla-Project-ID")
23+
EPSILLA_API_KEY = os.getenv("EPSILLA_API_KEY", "Your-Epsilla-API-Key")
24+
EPSILLA_DB_ID = os.getenv("EPSILLA_DB_ID", "Your-Epsilla-DB-ID")
25+
EPSILLA_DB_SHARDING_ID = os.getenv("EPSILLA_DB_SHARDING_ID", 0)
26+
27+
TABLE_NAME = os.getenv("TABLE_NAME", "MyTable")
28+
29+
db_name = f"db_{EPSILLA_DB_ID.replace('-', '_')}"
30+
db_path = f"/data/{EPSILLA_PROJECT_ID}/{db_name}/s{EPSILLA_DB_SHARDING_ID}"
31+
32+
33+
from langchain.text_splitter import CharacterTextSplitter
2634

2735
# Step3. Load the documents
2836
from langchain_community.document_loaders import WebBaseLoader
29-
from langchain.text_splitter import CharacterTextSplitter
3037
from langchain_openai import OpenAIEmbeddings
3138

32-
loader = WebBaseLoader("https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt")
39+
loader = WebBaseLoader(
40+
"https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt"
41+
)
3342
documents = loader.load()
34-
documents = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(documents)
43+
documents = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0).split_documents(
44+
documents
45+
)
3546
embeddings = OpenAIEmbeddings()
3647

3748

3849
# Step4. Load the vector store
3950
from langchain_community.vectorstores import Epsilla
4051
from pyepsilla import cloud
4152

42-
db_name = f"db_{db_id.replace('-', '_')}"
43-
db_path = f"/data/{project_id}/{db_name}/s{db_sharding_id}"
44-
table_name = "MyCollection"
45-
4653
# Step4.1 Connect to Epsilla Cloud
4754
cloud_client = cloud.Client(
48-
project_id=project_id,
49-
api_key=epsilla_api_key,
55+
project_id=EPSILLA_PROJECT_ID,
56+
api_key=EPSILLA_API_KEY,
5057
)
5158

5259
# Step4.2 Connect to Vectordb
53-
db_client = cloud_client.vectordb(db_id)
60+
db_client = cloud_client.vectordb(EPSILLA_DB_ID)
5461

5562
vector_store = Epsilla.from_documents(
5663
documents,
5764
embeddings,
5865
db_client,
5966
db_path=db_path,
6067
db_name=db_name,
61-
collection_name=table_name,
68+
collection_name=TABLE_NAME,
6269
)
6370

6471
# query = "What did the president say about Ketanji Brown Jackson"
6572
# docs = vector_store.similarity_search(query)
6673
# print(docs[0].page_content)
6774

6875

69-
7076
# Step5. Create the QA for Retrieval
7177
from langchain.chains import RetrievalQA
7278
from langchain_openai import OpenAI

examples/gist-960-euclidean.py

Lines changed: 38 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,72 +7,85 @@
77
# 3. wget http://ann-benchmarks.com/gist-960-euclidean.hdf5
88
# 4. python3 gist-960-euclidean.py
99

10-
from pyepsilla import vectordb
11-
import os, h5py, datetime
10+
import datetime
11+
import os
1212
from urllib.parse import urlparse
1313

14-
## Connect to Epsilla vector database
15-
client = vectordb.Client(host='127.0.0.1', port='8888')
16-
client.load_db(db_name="benchmark", db_path="/tmp/epsilla", vector_scale=1000000, wal_enabled=False) ## pay attention to change db_path to persistent volume for production environment
14+
import h5py
15+
from pyepsilla import vectordb
16+
17+
# Connect to Epsilla vector database
18+
client = vectordb.Client(host="127.0.0.1", port="8888")
19+
client.load_db(
20+
db_name="benchmark", db_path="/tmp/epsilla", vector_scale=1000000, wal_enabled=False
21+
) # pay attention to change db_path to persistent volume for production environment
1722
client.use_db(db_name="benchmark")
1823

19-
## Check gist-960-euclidean dataset hdf5 file to download or not
24+
# Download the gist-960-euclidean hdf5 dataset file if it is not already present locally
2025
dataset_download_url = "http://ann-benchmarks.com/gist-960-euclidean.hdf5"
2126
dataset_filename = os.path.basename(urlparse(dataset_download_url).path)
2227
if not os.path.isfile(dataset_filename):
2328
os.system("wget --no-check-certificate {}".format(dataset_download_url))
2429

25-
## Read gist-960-euclidean data from hdf5
26-
f = h5py.File('gist-960-euclidean.hdf5', 'r')
30+
# Read gist-960-euclidean data from hdf5
31+
f = h5py.File("gist-960-euclidean.hdf5", "r")
2732
print(list(f.keys()))
2833
training_data = f["train"]
2934
size = training_data.size
3035
records_num, dimensions = training_data.shape
3136

32-
## Create table for gist-960-euclidean
37+
# Create table for gist-960-euclidean
3338
id_field = {"name": "id", "dataType": "INT", "primaryKey": True}
3439
vec_field = {"name": "vector", "dataType": "VECTOR_FLOAT", "dimensions": dimensions}
3540
fields = [id_field, vec_field]
3641
status_code, response = client.create_table(table_name="benchmark", table_fields=fields)
3742

38-
## Insert 20000 data into table
39-
records_data = [ {"id": i, "vector": training_data[i].tolist()} for i in range(10000)]
43+
# Insert the first 10000 records into table
44+
records_data = [{"id": i, "vector": training_data[i].tolist()} for i in range(10000)]
4045
client.insert(table_name="benchmark", records=records_data)
4146

42-
## Insert all data into table
43-
indexs = [ i for i in range(0, records_num+10000, 50000)]
47+
# Insert all data into table
48+
indexs = [i for i in range(0, records_num + 10000, 50000)]
4449
print("Begin to insert all gist data into table ...")
45-
for i in range(len(indexs)-1):
46-
print("-"*20)
47-
start=datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
48-
print(indexs[i], indexs[i+1])
49-
records_data = [{"id": i, "vector": training_data[i].tolist()} for i in range(indexs[i], indexs[i+1])]
50+
for i in range(len(indexs) - 1):
51+
print("-" * 20)
52+
start = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
53+
print(indexs[i], indexs[i + 1])
54+
records_data = [
55+
{"id": i, "vector": training_data[i].tolist()}
56+
for i in range(indexs[i], indexs[i + 1])
57+
]
5058
client.insert(table_name="benchmark", records=records_data)
5159
end = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
5260
print("START:", start, "\nEND :", end)
5361

5462

55-
## Delete some data by ids
63+
# Delete some data by ids
5664
# client.delete(table_name="benchmark", ids=[300033, 600066])
5765
client.delete(table_name="benchmark", ids=[9999])
5866

5967

60-
## Rebuild ann graph, it will wait until rebuild is finished, wait time is depended on the amount of dataset
68+
# Rebuild the ANN graph; this call waits until the rebuild is finished, and the wait time depends on the size of the dataset
6169
client.rebuild()
6270

63-
## Query vectors
71+
# Query vectors
6472
query_field = "vector"
6573
query_vector = training_data[40000].tolist()
6674
response_fields = ["id"]
6775
limit = 2
6876

69-
status_code, response = client.query(table_name="benchmark", query_field=query_field, query_vector=query_vector, response_fields=response_fields, limit=limit, with_distance=True)
77+
status_code, response = client.query(
78+
table_name="benchmark",
79+
query_field=query_field,
80+
query_vector=query_vector,
81+
response_fields=response_fields,
82+
limit=limit,
83+
with_distance=True,
84+
)
7085
print("Response:", response)
7186

7287

73-
## Get
88+
# Get records from the benchmark table
7489
status_code, body = client.get(table_name="benchmark")
7590
print("Status Code:", status_code)
7691
print("Size of result gotten", len(body["result"]))
77-
78-

examples/hello_epsilla.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,25 +7,30 @@
77
# 3. python3 hello_epsilla.py
88
#
99

10+
import os
11+
1012
from pyepsilla import vectordb
1113

1214
# Connect to Epsilla VectorDB
1315
client = vectordb.Client(protocol="http", host="127.0.0.1", port="8888")
1416

15-
# You can also use Epsilla Cloud
16-
# client = vectordb.Client(protocol='https', host='demo.epsilla.com', port='443')
17+
18+
DB_NAME = os.getenv("DB_NAME", "MyDB")
19+
DB_PATH = os.getenv("DB_PATH", "/tmp/epsilla_demo")
20+
TABLE_NAME = os.getenv("TABLE_NAME", "MyTable")
21+
1722

1823
# Load DB with path
1924
# Remember to change db_path to a persistent volume for production environments
20-
status_code, response = client.load_db(db_name="MyDB", db_path="/data/epsilla_demo")
25+
status_code, response = client.load_db(db_name=DB_NAME, db_path=DB_PATH)
2126
print(response)
2227

2328
# Set DB to current DB
24-
client.use_db(db_name="MyDB")
29+
client.use_db(db_name=DB_NAME)
2530

2631
# Create a table with schema in current DB
2732
status_code, response = client.create_table(
28-
table_name="MyTable",
33+
table_name=TABLE_NAME,
2934
table_fields=[
3035
{"name": "ID", "dataType": "INT", "primaryKey": True},
3136
{"name": "Doc", "dataType": "STRING"},
@@ -40,7 +45,7 @@
4045

4146
# Insert new vector records into table
4247
status_code, response = client.insert(
43-
table_name="MyTable",
48+
table_name=TABLE_NAME,
4449
records=[
4550
{"ID": 1, "Doc": "Berlin", "Embedding": [0.05, 0.61, 0.76, 0.74]},
4651
{"ID": 2, "Doc": "London", "Embedding": [0.19, 0.81, 0.75, 0.11]},
@@ -53,7 +58,7 @@
5358

5459
# Query Vectors with specific response field
5560
status_code, response = client.query(
56-
table_name="MyTable",
61+
table_name=TABLE_NAME,
5762
query_field="Embedding",
5863
query_vector=[0.35, 0.55, 0.47, 0.94],
5964
response_fields=["Doc"],
@@ -62,32 +67,32 @@
6267

6368
# Query Vectors without specific response field, then it will return all fields
6469
status_code, response = client.query(
65-
table_name="MyTable",
70+
table_name=TABLE_NAME,
6671
query_field="Embedding",
6772
query_vector=[0.35, 0.55, 0.47, 0.94],
6873
limit=2,
6974
)
7075
print(response)
7176

7277
# Get Vectors
73-
status_code, response = client.get(table_name="MyTable", limit=2)
78+
status_code, response = client.get(table_name=TABLE_NAME, limit=2)
7479
print(response)
7580

7681
# Get Statistics
7782
status_code, response = client.statistics()
7883
print(response)
7984

8085
# Delete Vectors
81-
# status_code, response = client.delete(table_name="MyTable", ids=[3])
82-
status_code, response = client.delete(table_name="MyTable", primary_keys=[3, 4])
83-
# status_code, response = client.delete(table_name="MyTable", filter="Doc <> 'San Francisco'")
86+
# status_code, response = client.delete(table_name=TABLE_NAME, ids=[3])
87+
status_code, response = client.delete(table_name=TABLE_NAME, primary_keys=[3, 4])
88+
# status_code, response = client.delete(table_name=TABLE_NAME, filter="Doc <> 'San Francisco'")
8489
print(response)
8590

8691

8792
# Drop table
88-
# status_code, response = client.drop_table("MyTable")
93+
# status_code, response = client.drop_table(TABLE_NAME)
8994
# print(response)
9095

9196
# Unload db
92-
# status_code, response = client.unload_db("MyDB")
97+
# status_code, response = client.unload_db(DB_NAME)
9398
# print(response)

0 commit comments

Comments
 (0)