class LanceDBRetriever(Retriever[Any, RetrieverStrQueryType]):
    __doc__ = r"""Retriever that stores document embeddings in LanceDB and queries them.

    Documents are embedded with the supplied ``embedder`` and written to a
    LanceDB table named ``"documents"`` that has two columns: ``vector``
    (fixed-size list of float32 with ``dimensions`` entries) and ``content``
    (string). Queries are embedded with the same embedder and searched against
    that table.

    Args:
        embedder (Embedder): Called as ``embedder(input=list_of_texts)``; the
            result's ``.data`` items must expose an ``.embedding`` attribute.
        dimensions (int): The dimensionality of the embeddings; fixes the
            list size of the ``vector`` column.
        db_uri (str): The URI of the LanceDB storage (default is "/tmp/lancedb").
        top_k (int): The number of top results to retrieve for a given query
            (default is 5).
        overwrite (bool): If True, any existing ``"documents"`` table is
            replaced by an empty one. If False, an existing table is opened
            and reused (so later ``add_documents`` calls append to it); the
            table is created when it does not exist yet.

    More information on LanceDB can be found here: https://github.com/lancedb/lancedb

    Documentation: https://lancedb.github.io/lancedb/
    """

    def __init__(
        self,
        embedder: Embedder,
        dimensions: int,
        db_uri: str = "/tmp/lancedb",
        top_k: int = 5,
        overwrite: bool = True,
    ):
        super().__init__()
        self.db = lancedb.connect(db_uri)
        self.embedder = embedder
        self.top_k = top_k
        self.dimensions = dimensions

        schema = pa.schema(
            [
                pa.field("vector", pa.list_(pa.float32(), list_size=self.dimensions)),
                pa.field("content", pa.string()),
            ]
        )

        # LanceDB documents only "create" and "overwrite" as create_table
        # modes. To keep an existing table, use "create" with exist_ok=True,
        # which opens the table instead of raising when it already exists.
        if overwrite:
            self.table = self.db.create_table(
                "documents", schema=schema, mode="overwrite"
            )
        else:
            self.table = self.db.create_table(
                "documents", schema=schema, mode="create", exist_ok=True
            )

    def add_documents(self, documents: Sequence[Dict[str, Any]]):
        """Embed the documents' text and add the results to the table.

        Args:
            documents (Sequence[Dict[str, Any]]): A sequence of documents, each
                with a 'content' field containing text. Other keys are ignored.

        Raises:
            KeyError: If a document has no 'content' key.
        """
        if not documents:
            log.warning("No documents provided for embedding")
            return

        # Embed all document texts in a single embedder call.
        doc_texts = [doc["content"] for doc in documents]
        embeddings = self.embedder(input=doc_texts).data

        # One row per (embedding, text) pair, matching the table schema.
        data = [
            {"vector": embedding.embedding, "content": text}
            for embedding, text in zip(embeddings, doc_texts)
        ]
        self.table.add(data)
        log.info(f"Added {len(documents)} documents to the index")

    def retrieve(
        self, query: Union[str, List[str]], top_k: Optional[int] = None
    ) -> List[RetrieverOutput]:
        """Retrieve top-k documents from LanceDB for a given query or queries.

        Args:
            query (Union[str, List[str]]): A query string or a list of query
                strings.
            top_k (Optional[int]): The number of top documents to retrieve. A
                falsy value (``None`` or ``0``) falls back to the instance's
                ``top_k``.

        Returns:
            List[RetrieverOutput]: One RetrieverOutput per query embedding,
            holding the result indices, the values of the ``_distance``
            column, and the query string that produced them.

        Raises:
            ValueError: If no query is given, if any query string is empty or
                whitespace-only, or if ``self.table`` is falsy.
        """
        queries = [query] if isinstance(query, str) else list(query)

        # Validate after normalizing to a list so a lone "" is rejected too;
        # checking the raw string after wrapping it could never trigger.
        if not queries or any(isinstance(q, str) and not q.strip() for q in queries):
            raise ValueError("Query cannot be empty.")

        # NOTE(review): if the table object defines __len__, an empty table
        # would also land here - confirm that is the intended behavior.
        if not self.table:
            raise ValueError(
                "The index has not been initialized or the table is missing."
            )

        query_embeddings = self.embedder(input=queries).data
        limit = top_k or self.top_k

        output: List[RetrieverOutput] = []
        # Search once per query, pairing each embedding with its own query
        # text so every output is tagged with the query that produced it.
        for query_text, query_emb in zip(queries, query_embeddings):
            results = self.table.search(query_emb.embedding).limit(limit).to_pandas()

            # NOTE(review): this is the index of the returned DataFrame, which
            # is presumably the result rank rather than a stable document id
            # - confirm against callers before relying on it.
            indices = results.index.tolist()
            scores = results["_distance"].tolist()

            output.append(
                RetrieverOutput(
                    doc_indices=indices,
                    doc_scores=scores,
                    query=query_text,
                )
            )
        return output