# LocalRAG/utils.py at main · VijaySamant4368/LocalRAG · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from langchain_community.embeddings import OllamaEmbeddings
import re
import os
import json

# Directory one level above this module (kept un-normalized, i.e. "<module dir>/..").
PARENT_DIR = os.path.join(os.path.dirname(__file__), os.pardir)
# JSON file holding the stored sessions; it lives next to this module.
SESSIONS_FILE = os.path.join(os.path.dirname(__file__), "sessions.json")

def get_embedding_function():
    """Construct the ``OllamaEmbeddings`` object for the ``nomic-embed-text`` model.

    Returns:
        A new ``OllamaEmbeddings`` instance on every call.
    """
    embedding_model = "nomic-embed-text"
    return OllamaEmbeddings(model=embedding_model)

def strip_think(text: str) -> str:
    """Remove every ``<think>...</think>`` span from *text*.

    Matching is non-greedy and spans newlines, so each closed block is removed
    on its own. An opening tag without a closing tag is left untouched. The
    result is stripped of leading and trailing whitespace.
    """
    think_block = re.compile(r"<think>.*?</think>", re.DOTALL)
    without_blocks = think_block.sub("", text)
    return without_blocks.strip()

def resolve_path(path: str) -> str:
    """Return *path* unchanged if absolute, else resolve it against ``PARENT_DIR``.

    Relative paths are joined onto ``PARENT_DIR`` and normalized with
    ``os.path.normpath``; absolute paths are returned exactly as given
    (no normalization).
    """
    if not os.path.isabs(path):
        joined = os.path.join(PARENT_DIR, path)
        return os.path.normpath(joined)
    return path

def load_sessions():
    """Read the stored sessions from ``SESSIONS_FILE``.

    Returns:
        The value under the ``"sessions"`` key of the JSON document, or an
        empty list when the file does not exist or the key is absent.
    """
    if os.path.exists(SESSIONS_FILE):
        with open(SESSIONS_FILE, "r", encoding="utf-8") as handle:
            stored = json.load(handle)
            return stored.get("sessions", [])
    return []

def save_sessions(sessions):
    """Write *sessions* to ``SESSIONS_FILE`` as ``{"sessions": [...]}``.

    The document is written as UTF-8 JSON with a two-space indent and
    non-ASCII characters kept verbatim.

    Args:
        sessions: JSON-serializable collection of sessions.

    Raises:
        TypeError, ValueError: If *sessions* cannot be serialized. The
            existing file is left untouched in that case.
    """
    # Serialize before opening: mode "w" truncates the file immediately, so an
    # encoding error raised afterwards would wipe the previously stored sessions.
    payload = json.dumps({"sessions": sessions}, indent=2, ensure_ascii=False)
    with open(SESSIONS_FILE, "w", encoding="utf-8") as f:
        f.write(payload)


from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_community.document_loaders import (
    PyPDFDirectoryLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredPowerPointLoader,
    CSVLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredFileLoader,   # fallback
)

import tempfile

# For now, only text-based formats are enabled (.txt, .pdf, .csv, .md).
# (If you are a developer reading this (LOL, like someone would ACTUALLY read my code): to enable another format, install the library its loader requires, then uncomment the respective entry below.)

# Maps a file extension (lower-case, including the leading dot) to the loader
# class used for it. get_loader() falls back to UnstructuredFileLoader for any
# extension that is not listed here. Commented-out entries are disabled formats.
LOADERS = {
    ".pdf": PyPDFLoader,
    ".txt": TextLoader,
    # ".docx": UnstructuredWordDocumentLoader,
    # ".pptx": UnstructuredPowerPointLoader,
    ".csv": CSVLoader,
    # ".html": UnstructuredHTMLLoader,
    # ".htm": UnstructuredHTMLLoader,
    ".md": UnstructuredMarkdownLoader,
}

def get_loader(file_path: str, ext: str):
    """Instantiate the document loader for *file_path* based on *ext*.

    Args:
        file_path: Path handed to the loader's constructor.
        ext: File extension used as the lookup key in ``LOADERS``; it is
            matched as given, with no case normalization here.

    Returns:
        An instance of the loader class registered for *ext*, or an
        ``UnstructuredFileLoader`` when the extension is not registered.
    """
    registered_cls = LOADERS.get(ext)
    if registered_cls is not None:
        return registered_cls(file_path)

    # Fallback for every extension without a dedicated loader.
    return UnstructuredFileLoader(file_path)

def split_documents(documents: list[Document]):
    """Split *documents* into chunks with ``RecursiveCharacterTextSplitter``.

    The splitter is configured with ``chunk_size=800`` and
    ``chunk_overlap=80``, both measured with ``len``, and separators are
    treated as plain strings rather than regular expressions.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for *documents*.
    """
    splitter_options = {
        "chunk_size": 800,
        "chunk_overlap": 80,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

def add_to_chroma(chunks: list[Document], db):
    """Add to *db* only those chunks whose id is not stored there yet.

    Every chunk first gets an id via ``calculate_chunk_ids`` (written to
    ``chunk.metadata["id"]``). Chunks whose id already appears in
    ``db.get(...)["ids"]`` are skipped; the rest are passed to
    ``db.add_documents`` with their ids.

    Args:
        chunks: Chunks to index; their metadata is modified in place.
        db: Store object providing ``get(include=...)`` and
            ``add_documents(documents, ids=...)``.
    """
    identified = calculate_chunk_ids(chunks)

    stored = db.get(include=[])  # IDs are always included by default
    known_ids = set(stored["ids"])
    print(f"Number of existing documents in DB: {len(known_ids)}")

    pending = [c for c in identified if c.metadata["id"] not in known_ids]
    if not pending:
        print("No new documents to add")
        return

    pending_ids = [c.metadata["id"] for c in pending]
    db.add_documents(pending, ids=pending_ids)

def calculate_chunk_ids(chunks):
    """Assign a unique ``"source:page:index"`` id to every chunk, in place.

    This creates ids like ``"data/monopoly.pdf:6:2"``
    (page source : page number : chunk index). A missing or falsy ``page``
    metadata value is treated as page ``0``.

    The chunk index is counted per ``source:page`` pair across the whole
    input, so chunks of the same page receive distinct ids even when they are
    not adjacent (for example when two inputs share the same source name).
    For input where each page's chunks are contiguous the ids are the same as
    with a simple running index.

    Args:
        chunks: Objects with a ``metadata`` dict; ``metadata["id"]`` is set
            on each of them.

    Returns:
        The same *chunks* iterable that was passed in.
    """
    # Next free chunk index for each "source:page" key seen so far.
    next_index = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page") or 0
        page_id = f"{source}:{page}"

        index = next_index.get(page_id, 0)
        next_index[page_id] = index + 1

        chunk.metadata["id"] = f"{page_id}:{index}"

    return chunks

def add_files_to_db(files: list, db):
    """Load uploaded files, split them into chunks and add them to *db*.

    Each file is written to a temporary path carrying the original extension,
    loaded with the loader chosen by ``get_loader``, and every resulting
    document gets its ``source`` metadata set to the original filename. All
    documents are then split with ``split_documents`` and stored through
    ``add_to_chroma``.

    Args:
        files: Objects exposing a ``filename`` attribute and a ``save(path)``
            method (presumably Werkzeug ``FileStorage`` uploads; confirm
            against the caller).
        db: Store object forwarded to ``add_to_chroma``.
    """
    documents = []
    for file in files:
        ext = os.path.splitext(file.filename)[1].lower()

        # delete=False keeps the file on disk after the handle is closed;
        # only its unique path is needed here.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            temp_path = tmp.name
        try:
            # Saving happens inside the try block so the temporary file is
            # removed even when writing the upload fails.
            file.save(temp_path)
            docs = get_loader(temp_path, ext).load()
            for doc in docs:
                # Record the uploaded name instead of the temporary path.
                doc.metadata["source"] = file.filename
            documents.extend(docs)
        finally:
            if os.path.exists(temp_path):
                os.remove(temp_path)

    chunks = split_documents(documents)
    add_to_chroma(chunks, db)
    print("Added files")