# Scraped from a HuggingFace Space (page status banner: "Sleeping") — not part of the script.
| from datasets import load_dataset | |
| from datasets import Dataset | |
| import datasets | |
| from tqdm import tqdm | |
| from transformers import AutoTokenizer | |
| from langchain.docstore.document import Document | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores.utils import DistanceStrategy | |
# Load the GAIA level-1 test split. Uses the directly-imported load_dataset
# (it was imported at the top of the file but previously unused, while the
# code redundantly went through the datasets module namespace).
knowledge_base = load_dataset("gaia-benchmark/GAIA", '2023_level1', split='test')
print(knowledge_base.column_names)
# ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata']

# Wrap each row as a LangChain Document: the question text is the searchable
# content; every other column rides along as metadata so it can be recovered
# after retrieval.
source_docs = [
    Document(
        page_content=doc["Question"],
        metadata={
            "task_id": doc["task_id"],
            "level": doc["Level"],
            "final_answer": doc["Final answer"],
            "file_name": doc["file_name"],
            "file_path": doc["file_path"],
            "annotator_metadata": doc["Annotator Metadata"],
        },
    )
    for doc in knowledge_base
]
# Measure chunk sizes with the same tokenizer as the embedding model
# ("thenlper/gte-small") so every chunk fits that model's input window.
splitter_tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    splitter_tokenizer,
    chunk_size=200,
    chunk_overlap=20,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)
# Split every source document into chunks, keeping only the first occurrence
# of each chunk text (exact-duplicate questions/fragments are dropped).
print("Splitting documents...")
docs_processed = []
# A set is the idiomatic O(1) membership structure; the original used a dict
# whose values were always True, i.e. a set in disguise.
seen_texts = set()
for doc in tqdm(source_docs):
    for chunk in text_splitter.split_documents([doc]):
        if chunk.page_content not in seen_texts:
            seen_texts.add(chunk.page_content)
            docs_processed.append(chunk)
print("Embedding documents... This should take a few minutes (5 minutes on MacBook with M1 Pro)")
# Embed with gte-small — the same model whose tokenizer sized the chunks —
# then index all deduplicated chunks in FAISS under cosine distance.
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
vectordb = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)
if __name__ == "__main__":
    # Smoke test only: all the heavy work above runs at import time, so here
    # we just confirm the FAISS store was built.
    # (Removed commented-out dead code that referenced an undefined `dataset`.)
    print(vectordb)