diff --git a/.gitignore b/.gitignore
index 071f9a8..9928737 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,9 @@ mpnet/
 baai/
 whisper
 espeak-ng-data
+
+## LINUX IGNORE
+env/
+__pycache__/
+*.log
+*.out
\ No newline at end of file
diff --git a/README.md b/README.md
index 0c362d1..12e4c8e 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,16 @@ Now python should be setup and running! However, there is still a few more steps
 
 Finally, download the Mistral 7B LLM from the following link and place it inside the `llm/scripts` directory alongside the python scripts used by Dot: [TheBloke/Mistral-7B-Instruct-v0.2-GGUF](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/blob/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf)
 
+## Linux Installation
+```bash
+$ chmod +x setup_python.sh
+$ ./setup_python.sh
+```
+The script above checks whether Python is installed on your system and starts the setup.
+Once the setup is done, configure the models by setting their options in the config file located at `llm/config.ini`.
+
+Finally, run `npm start`.
+
 That's it! If you follow these steps you should be able to get it all running, please let me know if you are facing any issues :)
 
 
diff --git a/llm/config.ini b/llm/config.ini
new file mode 100644
index 0000000..457dc4d
--- /dev/null
+++ b/llm/config.ini
@@ -0,0 +1,2 @@
+[EMBEDDING MODEL CONFIG]
+MODEL_NAME=sentence-transformers/all-mpnet-base-v2
\ No newline at end of file
diff --git a/llm/requirements.txt b/llm/requirements.txt
new file mode 100644
index 0000000..81e23e0
--- /dev/null
+++ b/llm/requirements.txt
@@ -0,0 +1,8 @@
+torch
+langchain-community
+faiss-cpu
+huggingface_hub
+llama-cpp-python -C cmake.args="-DLLAMA_BLAS=ON;-DLLAMA_BLAS_VENDOR=OpenBLAS;-DLLAMA_CUDA=ON"
+pypdf
+unstructured
+docx2txt
\ No newline at end of file
diff --git a/llm/scripts/bigdot.py b/llm/scripts/bigdot.py
index 89ef094..51b7ae9 100644
--- a/llm/scripts/bigdot.py
+++ b/llm/scripts/bigdot.py
@@ -1,14 +1,15 @@
-from langchain.prompts import PromptTemplate
-from langchain.chains import LLMChain
-from langchain.llms import LlamaCpp
-from langchain.memory import ConversationBufferWindowMemory
+from langchain_core.prompts.prompt import PromptTemplate
+from langchain.chains.llm import LLMChain
+from langchain_community.llms.llamacpp import LlamaCpp
+from langchain.memory.buffer_window import ConversationBufferWindowMemory
 import sys
 import json
 import os
+
 
 def read_config(config_path):
     try:
-        with open(config_path, 'r') as file:
+        with open(config_path, "r") as file:
             config = json.load(file)
         return config
     except FileNotFoundError:
@@ -18,6 +19,7 @@ def read_config(config_path):
         print("Error decoding JSON from configuration file.")
         return {}
 
+
 if __name__ == "__main__":
 
     if len(sys.argv) > 1:
@@ -26,24 +28,27 @@ def read_config(config_path):
         folder_name = "Dot-Data"
         folder_path = os.path.join(documents_path, folder_name)
         if not os.path.exists(folder_path):
-            print('LLM NOT FOUND!')
+            print("LLM NOT FOUND!")
             os.makedirs(folder_path)
         model_path = os.path.join(folder_path, "Phi-3-mini-4k-instruct-q4.gguf")
         config_path = sys.argv[1].strip('"')  # Remove any surrounding quotes
         config = read_config(config_path)
-        #print("Current Configuration:", config)
+        # print("Current Configuration:", config)
         # Setup configuration with defaults in case some settings are missing
         n_gpu_layers = -1  # Metal set to 1 is typically enough for Apple Silicon
-        n_batch = config.get('n_batch', 256)
-        max_tokens = config.get('max_tokens', 500)
-        temperature = config.get('big_dot_temperature', 0.7)
-
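The new `llm/config.ini` keeps the embedding model under an `[EMBEDDING MODEL CONFIG]` section. A minimal sketch of how a file in this format is typically read with the standard-library `configparser` (the path and variable names below are illustrative, not taken from this diff):

```python
import configparser

config = configparser.ConfigParser()
config.read("llm/config.ini")  # paths that do not exist are silently skipped

# A section behaves like a mapping; the value itself lives one level deeper.
section = config["EMBEDDING MODEL CONFIG"]
model_name = section["MODEL_NAME"]
print(model_name)  # sentence-transformers/all-mpnet-base-v2
```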
n_ctx = config.get('n_ctx', 4000) - initial_prompt = config.get('big_dot_prompt', "You are called Dot, You are a helpful and honest assistant.") - if 'ggufFilePath' in config and config['ggufFilePath'] is None: - del config['ggufFilePath'] # Removes the key if it's explicitly None - llm_model = config.get('ggufFilePath', model_path) + n_batch = config.get("n_batch", 256) + max_tokens = config.get("max_tokens", 500) + temperature = config.get("big_dot_temperature", 0.7) + n_ctx = config.get("n_ctx", 4000) + initial_prompt = config.get( + "big_dot_prompt", + "You are called Dot, You are a helpful and honest assistant.", + ) + if "ggufFilePath" in config and config["ggufFilePath"] is None: + del config["ggufFilePath"] # Removes the key if it's explicitly None + llm_model = config.get("ggufFilePath", model_path) # Initialize the LLM with the configuration llm = LlamaCpp( @@ -67,12 +72,7 @@ def read_config(config_path): Response:""" prompt = PromptTemplate.from_template(template) memory = ConversationBufferWindowMemory(memory_key="chat_history", k=2) - conversation = LLMChain( - llm=llm, - prompt=prompt, - verbose=False, - memory=memory - ) + conversation = LLMChain(llm=llm, prompt=prompt, verbose=False, memory=memory) def send_response(response): response_json = json.dumps({"result": response}) @@ -85,9 +85,10 @@ def send_response(response): if not user_input: break - result = conversation({"question": user_input})['text'] + result = conversation({"question": user_input})["text"] max_chunk_length = 1000 # Define max length for output chunks - chunks = [result[i:i + max_chunk_length] for i in range(0, len(result), max_chunk_length)] + chunks = [ + result[i : i + max_chunk_length] + for i in range(0, len(result), max_chunk_length) + ] send_response(chunks) - - diff --git a/llm/scripts/docdot.py b/llm/scripts/docdot.py index 303583c..5da25ae 100644 --- a/llm/scripts/docdot.py +++ b/llm/scripts/docdot.py @@ -1,22 +1,22 @@ import sys import json -from langchain import PromptTemplate -from langchain.chains import RetrievalQA -from langchain.embeddings import HuggingFaceEmbeddings -from langchain.vectorstores import FAISS -from langchain.document_loaders import PyPDFLoader, DirectoryLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler -from langchain.llms import LlamaCpp -from langchain import PromptTemplate -from langchain.callbacks.manager import CallbackManager +import torch +from langchain_core.prompts.prompt import PromptTemplate +from langchain.chains.retrieval_qa.base import RetrievalQA +from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings +from langchain_community.vectorstores.faiss import FAISS +from langchain_community.document_loaders.pdf import PyPDFLoader +from langchain_community.document_loaders.directory import DirectoryLoader +from langchain_text_splitters.character import RecursiveCharacterTextSplitter +from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler +from langchain_community.llms.llamacpp import LlamaCpp +from langchain_core.callbacks.manager import CallbackManager import os - def read_config(config_path): try: - with open(config_path, 'r') as file: + with open(config_path, "r") as file: config = json.load(file) return config except FileNotFoundError: @@ -27,67 +27,82 @@ def read_config(config_path): return {} - if __name__ == "__main__": if len(sys.argv) > 1: + device_type = ( + "cuda" + if torch.cuda.is_available() + 
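The `del config["ggufFilePath"]` step in `bigdot.py` above matters because `dict.get` only falls back to its default for a missing key, not for a key that is present with a `None` value. A small illustration (the default path here is hypothetical):

```python
config = {"ggufFilePath": None}
default_path = "/models/Phi-3-mini-4k-instruct-q4.gguf"

# The explicit None wins over the fallback:
print(config.get("ggufFilePath", default_path))  # -> None

# After dropping the None entry, the fallback is used as intended:
if config.get("ggufFilePath") is None:
    config.pop("ggufFilePath", None)
print(config.get("ggufFilePath", default_path))  # -> /models/Phi-3-mini-4k-instruct-q4.gguf
```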
else "mps" if torch.backends.mps.is_available() else "cpu" + ) + # Folder paths and model initialization documents_path = os.path.join(os.path.expanduser("~"), "Documents") folder_name = "Dot-Data" folder_path = os.path.join(documents_path, folder_name) if not os.path.exists(folder_path): - print('LLM NOT FOUND!') + print("LLM NOT FOUND!") os.makedirs(folder_path) model_path = os.path.join(folder_path, "Phi-3-mini-4k-instruct-q4.gguf") config_path = sys.argv[1].strip('"') # Remove any surrounding quotes config = read_config(config_path) - #print("Current Configuration:", config) + # print("Current Configuration:", config) # Setup configuration with defaults in case some settings are missing n_gpu_layers = -1 # Metal set to 1 is typically enough for Apple Silicon - n_batch = config.get('n_batch', 256) # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip. - max_tokens = config.get('max_tokens', 500) - temperature = config.get('big_dot_temperature', 0.7) - n_ctx = config.get('n_ctx', 4000) - initial_prompt = config.get('big_dot_prompt', "You are called Dot, You are a helpful and honest assistant.") - if 'ggufFilePath' in config and config['ggufFilePath'] is None: - del config['ggufFilePath'] # Removes the key if it's explicitly None - - llm_model = config.get('ggufFilePath', model_path) # This now only falls back to model_path if 'ggufFilePath' is not in config + n_batch = config.get( + "n_batch", 256 + ) # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip. + max_tokens = config.get("max_tokens", 500) + temperature = config.get("big_dot_temperature", 0.7) + n_ctx = config.get("n_ctx", 4000) + initial_prompt = config.get( + "big_dot_prompt", + "You are called Dot, You are a helpful and honest assistant.", + ) + if "ggufFilePath" in config and config["ggufFilePath"] is None: + del config["ggufFilePath"] # Removes the key if it's explicitly None + llm_model = config.get( + "ggufFilePath", model_path + ) # This now only falls back to model_path if 'ggufFilePath' is not in config current_directory = os.path.dirname(os.path.realpath(__file__)) - model_directory = os.path.join(current_directory, '..', 'baai') + model_directory = os.path.join(current_directory, "..", "baai") - #print("Model Directory:", os.path.abspath(model_directory)) + # print("Model Directory:", os.path.abspath(model_directory)) ### LOAD EMBEDDING SETTINGS - embeddings=HuggingFaceEmbeddings(model_name=model_directory, model_kwargs={'device':'mps'}) # SET TO 'cpu' for PC - vector_store = FAISS.load_local(os.path.join(folder_path, "Dot-data"), embeddings) - + embeddings = HuggingFaceEmbeddings( + model_name=model_directory, model_kwargs={"device": device_type} + ) # SET TO 'cpu' for PC + vector_store = FAISS.load_local( + os.path.join(folder_path, "Dot-data"), embeddings + ) # Find the current script's directory script_dir = os.path.dirname(__file__) - llm = LlamaCpp( model_path=llm_model, n_gpu_layers=n_gpu_layers, n_batch=n_batch, f16_kv=True, # MUST set to True, otherwise you will run into problem after a couple of calls ONLY FOR MAC max_tokens=max_tokens, - temperature= 0.01, + temperature=0.01, n_ctx=n_ctx, ) - DEFAULT_SYSTEM_PROMPT =""" + DEFAULT_SYSTEM_PROMPT = """ You are a good, honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you do not know the answer to a question, make it clear you do not know the answer instead of making up false information. 
""".strip() - def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str: + def generate_prompt( + prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT + ) -> str: return f""" [INST] <> {system_prompt} @@ -96,7 +111,7 @@ def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> {prompt} [/INST] """.strip() - SYSTEM_PROMPT ="Use the following pieces of context to answer the question at the end. If you do not know the answer, just say you don't know, don't try to make up an answer." + SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. If you do not know the answer, just say you don't know, don't try to make up an answer." template = generate_prompt( """ @@ -104,20 +119,22 @@ def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> Question: {question} """, - system_prompt = SYSTEM_PROMPT, + system_prompt=SYSTEM_PROMPT, ) + qa_prompt = PromptTemplate( + template=template, input_variables=["context", "question"] + ) - qa_prompt=PromptTemplate(template=template, input_variables=['context', 'question']) - - #start=timeit.default_timer() - - chain = RetrievalQA.from_chain_type(llm=llm, - chain_type='stuff', - retriever=vector_store.as_retriever(search_kwargs={'k': 2}), - return_source_documents=True, - chain_type_kwargs={'prompt': qa_prompt}) + # start=timeit.default_timer() + chain = RetrievalQA.from_chain_type( + llm=llm, + chain_type="stuff", + retriever=vector_store.as_retriever(search_kwargs={"k": 2}), + return_source_documents=True, + chain_type_kwargs={"prompt": qa_prompt}, + ) def format_response(dictionary): """ @@ -130,56 +147,62 @@ def format_response(dictionary): """ # Correctly define source_documents from the dictionary source_documents = dictionary["source_documents"] - + sources = "### Source Documents:\n" for doc in source_documents: # Safely get the 'source' and 'page' from metadata, default if not found source_path = doc.metadata.get("source", "Source path not available.") page_number = doc.metadata.get("page", "Page number not available.") - file_extension = source_path.split('.')[-1].lower() if source_path else "" - + file_extension = ( + source_path.split(".")[-1].lower() if source_path else "" + ) + # Metadata information metadata_info = f"**Source**: {source_path}\n**Page**: {page_number}\n" - if file_extension == 'pdf' and source_path != "Source path not available.": + if ( + file_extension == "pdf" + and source_path != "Source path not available." + ): source_path_with_page = f"{source_path}#page={page_number}" iframe_html = f'' sources += f"\n\n{metadata_info}\n{iframe_html}\n\n" - elif file_extension in ['doc', 'docx', 'xls', 'xlsx', 'ppt', 'pptx']: + elif file_extension in ["doc", "docx", "xls", "xlsx", "ppt", "pptx"]: # For Word, Excel, PowerPoint, display page_content text - page_content_text = doc.page_content.replace('\n', ' ') if doc.page_content else "Page content not available." + page_content_text = ( + doc.page_content.replace("\n", " ") + if doc.page_content + else "Page content not available." + ) sources += f"\n\n{metadata_info}\n{page_content_text}\n\n" else: # Fallback for other file types or if page_content should be displayed by default - page_content_text = doc.page_content.replace('\n', ' ') if doc.page_content else "Page content not available." + page_content_text = ( + doc.page_content.replace("\n", " ") + if doc.page_content + else "Page content not available." 
+ ) sources += f"\n\n{metadata_info}\n{page_content_text}\n\n" - + # Now appending the formatted result at the end formatted_result = dictionary["result"] complete_response = sources + "\n\n---\n\n### Result:\n" + formatted_result - - return complete_response - - - - + return complete_response def chat(input_text): while True: - user_input=str(input_text) - query='ass' - if query=='exit': - print('Exiting') + user_input = str(input_text) + query = "ass" + if query == "exit": + print("Exiting") sys.exit() - if query=='': + if query == "": continue - result = chain({'query': user_input}) + result = chain({"query": user_input}) formatted_response = format_response(result) return formatted_response - - def send_response(response): # Convert the response to JSON response_json = json.dumps({"result": response}) @@ -190,7 +213,6 @@ def send_response(response): # Flush stdout to ensure the message is sent immediately sys.stdout.flush() - while True: # Read input continuously from stdin line = sys.stdin.readline().strip() @@ -202,11 +224,13 @@ def send_response(response): # Perform your processing on user_input result = chat(user_input) - + # Split the result into chunks of maximum length (e.g., 1000 characters) max_chunk_length = 1000 - chunks = [result[i:i + max_chunk_length] for i in range(0, len(result), max_chunk_length)] + chunks = [ + result[i : i + max_chunk_length] + for i in range(0, len(result), max_chunk_length) + ] # Send the chunks as an array send_response(chunks) - diff --git a/llm/scripts/embeddings.py b/llm/scripts/embeddings.py index e289486..a9b2aaf 100644 --- a/llm/scripts/embeddings.py +++ b/llm/scripts/embeddings.py @@ -1,16 +1,32 @@ import sys -from langchain.embeddings import HuggingFaceEmbeddings -from langchain.vectorstores import FAISS -from langchain.document_loaders import PyPDFLoader, DirectoryLoader, UnstructuredExcelLoader, TextLoader, UnstructuredPowerPointLoader, UnstructuredMarkdownLoader, Docx2txtLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter +import torch +from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings +from langchain_community.vectorstores.faiss import FAISS + +from langchain_community.document_loaders.pdf import PyPDFLoader +from langchain_community.document_loaders.directory import DirectoryLoader +from langchain_community.document_loaders.excel import UnstructuredExcelLoader +from langchain_community.document_loaders.text import TextLoader +from langchain_community.document_loaders.powerpoint import UnstructuredPowerPointLoader +from langchain_community.document_loaders.markdown import UnstructuredMarkdownLoader +from langchain_community.document_loaders.word_document import Docx2txtLoader +from langchain_text_splitters.character import RecursiveCharacterTextSplitter + +from .file_io.readers import ReadFile + import os import json -#import faiss +reader = ReadFile() +config_filepath = "config.ini" +config = reader.read_config(config_filepath) +EMB_MODEL_CONFIG_SECTION = "EMBEDDING MODEL CONFIG" +EMB_MODEL_NAME = config[EMB_MODEL_CONFIG_SECTION] + def read_config(config_path): try: - with open(config_path, 'r') as file: + with open(config_path, "r") as file: config = json.load(file) return config except FileNotFoundError: @@ -22,34 +38,49 @@ def read_config(config_path): if __name__ == "__main__": - print("Arguments received:", sys.argv) # This helps to debug the actual input received + print( + "Arguments received:", sys.argv + ) # This helps to debug the actual input received + + device_type = ( + 
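A hedged note on `docdot.py`'s `FAISS.load_local` call after the move to `langchain_community`: newer releases of the loader refuse to unpickle a saved index unless the caller opts in, while older releases do not accept the keyword at all, so whether this applies depends on the version resolved from `llm/requirements.txt`. A sketch of the call with the opt-in flag:

```python
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.faiss import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# "Dot-data" stands in for docdot.py's os.path.join(folder_path, "Dot-data").
vector_store = FAISS.load_local(
    "Dot-data",
    embeddings,
    allow_dangerous_deserialization=True,  # keyword accepted by newer langchain-community releases
)
```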
"cuda" + if torch.cuda.is_available() + else "mps" if torch.backends.mps.is_available() else "cpu" + ) + if len(sys.argv) > 2: quotedDirectory = sys.argv[1] - config_path = sys.argv[2].strip('"') # Correctly reference config path as the second argument + config_path = sys.argv[2].strip( + '"' + ) # Correctly reference config path as the second argument config = read_config(config_path) else: print("Not enough arguments provided.") - - chunk_size = config.get('chunk_length', 4000) # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip. - chunk_overlap = config.get('chunk_overlap', 2000) # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip. + chunk_size = config.get( + "chunk_length", 4000 + ) # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip. + chunk_overlap = config.get( + "chunk_overlap", 2000 + ) # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip. print(chunk_size, chunk_overlap) def embeddings(chosen_directory): current_directory = os.path.dirname(os.path.realpath(__file__)) - model_directory = os.path.join(current_directory, '..', 'baai') + # model_directory = os.path.join(current_directory, "..", "baai") - print("Model Directory:", os.path.abspath(model_directory)) + # print("Model Directory:", os.path.abspath(model_directory)) ### LOAD EMBEDDING SETTINGS - embeddings=HuggingFaceEmbeddings(model_name=model_directory, model_kwargs={'device':'mps'}) # SET TO 'cpu' for PC + embeddings = HuggingFaceEmbeddings( + model_name=EMB_MODEL_NAME, model_kwargs={"device": device_type} + ) # SET TO 'cpu' for PC - text_splitter=RecursiveCharacterTextSplitter( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap) - + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=chunk_size, chunk_overlap=chunk_overlap + ) victor = FAISS.from_texts(["foo"], embeddings) @@ -67,75 +98,76 @@ def embeddings(chosen_directory): if not os.path.exists(folder_path): os.makedirs(folder_path) - directory = str(chosen_directory) ### PDF try: - #**Step 1: Load the PDF File from Data Path**** - loader1=DirectoryLoader(directory, - glob="*.pdf", - loader_cls=PyPDFLoader, - show_progress=True, - use_multithreading=True, - recursive=True) - + # **Step 1: Load the PDF File from Data Path**** + loader1 = DirectoryLoader( + directory, + glob="*.pdf", + loader_cls=PyPDFLoader, + show_progress=True, + use_multithreading=True, + recursive=True, + ) + documents_pdf = loader1.load() - text_chunks_pdf=text_splitter.split_documents(documents_pdf) + text_chunks_pdf = text_splitter.split_documents(documents_pdf) print(len(text_chunks_pdf)) - #**Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** - vector_store_pdf=FAISS.from_documents(text_chunks_pdf, embeddings) - #vector_store_pdf.save_local(os.path.join(folder_path, "Dot-data-pdf")) + # **Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** + vector_store_pdf = FAISS.from_documents(text_chunks_pdf, embeddings) + # vector_store_pdf.save_local(os.path.join(folder_path, "Dot-data-pdf")) victor.merge_from(vector_store_pdf) except Exception as error: print("NO PDFs FOUND" + str(error)) - - ### WORD - try: - loader2=DirectoryLoader(directory, - glob="*.docx", - loader_cls=Docx2txtLoader, - show_progress=True, - use_multithreading=True, - recursive=True) - + try: + loader2 = DirectoryLoader( + directory, + glob="*.docx", + loader_cls=Docx2txtLoader, + show_progress=True, + 
use_multithreading=True, + recursive=True, + ) + documents_word = loader2.load() - text_chunks_word=text_splitter.split_documents(documents_word) + text_chunks_word = text_splitter.split_documents(documents_word) print(len(text_chunks_word)) - #**Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** - vector_store_word=FAISS.from_documents(text_chunks_word, embeddings) - #vector_store_word.save_local(os.path.join(folder_path, "Dot-data-word")) + # **Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** + vector_store_word = FAISS.from_documents(text_chunks_word, embeddings) + # vector_store_word.save_local(os.path.join(folder_path, "Dot-data-word")) victor.merge_from(vector_store_word) except Exception as error: print("NO WORD DOCUMENTS FOUND" + str(error)) - - ### POWER POINT try: - loader3=DirectoryLoader(directory, - glob="*.pptx", - loader_cls=UnstructuredPowerPointLoader, - show_progress=True, - use_multithreading=True, - recursive=True) - + loader3 = DirectoryLoader( + directory, + glob="*.pptx", + loader_cls=UnstructuredPowerPointLoader, + show_progress=True, + use_multithreading=True, + recursive=True, + ) + documents_ppt = loader3.load() - text_chunks_ppt=text_splitter.split_documents(documents_ppt) + text_chunks_ppt = text_splitter.split_documents(documents_ppt) print(len(text_chunks_ppt)) - #**Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** - vector_store_ppt=FAISS.from_documents(text_chunks_ppt, embeddings) - #vector_store_ppt.save_local(os.path.join(folder_path, "Dot-data-ppt")) + # **Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** + vector_store_ppt = FAISS.from_documents(text_chunks_ppt, embeddings) + # vector_store_ppt.save_local(os.path.join(folder_path, "Dot-data-ppt")) victor.merge_from(vector_store_ppt) except Exception as error: @@ -143,21 +175,23 @@ def embeddings(chosen_directory): ### EXCEL try: - loader4=DirectoryLoader(directory, - glob="*.xlsx", - loader_cls=UnstructuredExcelLoader, - show_progress=True, - use_multithreading=True, - recursive=True) - + loader4 = DirectoryLoader( + directory, + glob="*.xlsx", + loader_cls=UnstructuredExcelLoader, + show_progress=True, + use_multithreading=True, + recursive=True, + ) + documents_xlsx = loader4.load() - text_chunks_xlsx=text_splitter.split_documents(documents_xlsx) + text_chunks_xlsx = text_splitter.split_documents(documents_xlsx) print(len(text_chunks_ppt)) - #**Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** - vector_store_xlsx=FAISS.from_documents(text_chunks_xlsx, embeddings) - #vector_store_ppt.save_local(os.path.join(folder_path, "Dot-data-ppt")) + # **Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** + vector_store_xlsx = FAISS.from_documents(text_chunks_xlsx, embeddings) + # vector_store_ppt.save_local(os.path.join(folder_path, "Dot-data-ppt")) victor.merge_from(vector_store_xlsx) except Exception as error: @@ -165,27 +199,28 @@ def embeddings(chosen_directory): # MARKDOWN try: - loader5=DirectoryLoader(directory, - glob="*.md", - loader_cls=UnstructuredMarkdownLoader, - show_progress=True, - use_multithreading=True, - recursive=True) - + loader5 = DirectoryLoader( + directory, + glob="*.md", + loader_cls=UnstructuredMarkdownLoader, + show_progress=True, + use_multithreading=True, + recursive=True, + ) + documents_md = loader5.load() - text_chunks_md=text_splitter.split_documents(documents_md) + text_chunks_md = 
text_splitter.split_documents(documents_md) print(len(text_chunks_md)) - #**Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** - vector_store_md=FAISS.from_documents(text_chunks_md, embeddings) - #vector_store_ppt.save_local(os.path.join(folder_path, "Dot-data-ppt")) + # **Step 4: Convert the Text Chunks into Embeddings and Create a FAISS Vector Store*** + vector_store_md = FAISS.from_documents(text_chunks_md, embeddings) + # vector_store_ppt.save_local(os.path.join(folder_path, "Dot-data-ppt")) victor.merge_from(vector_store_md) except Exception as error: print("NO MARKDOWN FOUND" + str(error)) - - + victor.save_local(os.path.join(folder_path, "Dot-data")) print("Usage: python your_script.py ") @@ -197,5 +232,3 @@ def embeddings(chosen_directory): print(f"Processing directory: {directory_path}") embeddings(directory_path) print("LESGOOOOOO") - - diff --git a/llm/scripts/file_io/__init__.py b/llm/scripts/file_io/__init__.py new file mode 100644 index 0000000..8d5475c --- /dev/null +++ b/llm/scripts/file_io/__init__.py @@ -0,0 +1,5 @@ +"""Initialize FILE IO""" + +import configparser +import json +import pickle diff --git a/llm/scripts/file_io/readers.py b/llm/scripts/file_io/readers.py new file mode 100644 index 0000000..5f9fa24 --- /dev/null +++ b/llm/scripts/file_io/readers.py @@ -0,0 +1,80 @@ +"""A Class that reads a file and returns its contents in the required data structure""" + +from . import pickle +from . import configparser +from . import json + + +class ReadFile: + """A Class that reads a file and returns its contents in the required data structure""" + + def read_txt(self, filepath: str, as_list: bool = False, delimiter: str = "\n"): + """## Read `.txt` Files to variables effortlessly. + + Args: + filepath (str): Path to your `.txt` file + as_list (bool, optional): Return file contents as a list. Defaults to False. + delimiter (str, optional): Delimiter to split the file contents to return as a list. Defaults to "\\n". + + Returns: + Any: `str` or `list` + """ + read_mode_string = "r" + with open(filepath, mode=read_mode_string, encoding="utf-8") as file_contents: + contents = file_contents.read() + + if as_list: + contents = contents.split(delimiter) + + return contents + + def read_pickle(self, filepath: str): + """## Read `.pickle` or `.pkl` Files. + + Args: + filepath (str): Path to your `.pickle` or `.pkl` file. + + Returns: + Any + """ + read_mode_string = "rb" + with open(filepath, read_mode_string) as file_contents: + contents = pickle.load(file_contents) + + return contents + + def read_config(self, filename: str) -> configparser.ConfigParser: + """Reads .ini file and returns the object + + Returns: + object: config object + """ + config = configparser.ConfigParser() + config.read(filename) + return config + + def read_json(self, filename: str) -> dict: + """Get JSON contents as a dictionary + + Args: + filename (str): Name of the JSON file to read contents from. + + Returns: + dict: Dictionary output of the JSON file. + """ + with open(filename) as file_contents: + json_dict = json.load(file_contents) + + return json_dict + + def read_jsonl(self, filepath: str) -> list: + """Get JSONL contents as a list + + Args: + filepath (str): Path of the JSONL file to read contents from. + + Returns: + list: List output of the JSONL file. 
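Two details in the `embeddings.py` changes above are worth double-checking: `config[EMB_MODEL_CONFIG_SECTION]` evaluates to the whole `configparser` section object, while `HuggingFaceEmbeddings(model_name=...)` expects the model string itself, and the relative import `from .file_io.readers import ReadFile` only resolves when the module runs as part of a package, not when the script is launched directly. A sketch of the lookup under those assumptions (the absolute import and explicit config path are suggestions, not what the diff ships):

```python
import os

from file_io.readers import ReadFile  # absolute import when the script is run directly

# Resolve config.ini relative to the script rather than the current working directory;
# ConfigParser.read() silently returns an empty config for paths it cannot find.
script_dir = os.path.dirname(os.path.realpath(__file__))
config = ReadFile().read_config(os.path.join(script_dir, "..", "config.ini"))

EMB_MODEL_CONFIG_SECTION = "EMBEDDING MODEL CONFIG"
# Index into the section for the value itself, not the SectionProxy object.
EMB_MODEL_NAME = config[EMB_MODEL_CONFIG_SECTION]["MODEL_NAME"]
```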
+ """ + with open(filepath, "r", encoding="utf-8") as jsonl_file: + return [json.loads(line) for line in jsonl_file] diff --git a/llm/scripts/file_io/writers.py b/llm/scripts/file_io/writers.py new file mode 100644 index 0000000..3710fb5 --- /dev/null +++ b/llm/scripts/file_io/writers.py @@ -0,0 +1,59 @@ +"""Write File""" + +from . import pickle +from . import json + + +class WriteFile: + """Write File Class""" + + def __init__(self): + """Initialize Write File Class""" + self.write_mode_string = "w" + self.write_binary_mode_string = "wb" + + def write_txt( + self, + filepath: str, + contents, + from_list: bool = True, + delimiter: str = "\n", + ): + """write to `.txt` file + + Args: + filepath (str) + contents (_type_) + from_list (bool, optional). Defaults to True. + delimiter (str, optional). Defaults to "\n". + + Returns: + None + """ + if from_list: + contents = delimiter.join(contents) + with open(filepath, mode=self.write_mode_string, encoding="utf-8") as txt_file: + txt_file.write(contents) + + def write_pickle(self, contents, filepath: str): + """Write as pickle + + Args: + contents (Any) + filepath (str) + """ + with open( + filepath, + self.write_binary_mode_string, + ) as pickle_file: + pickle.dump(contents, pickle_file) + + def write_json(self, dictionary: dict, filepath: str = "results.json"): + """Write as JSON + + Args: + dictionary (dict) + filepath (str, optional). Defaults to "results.json". + """ + with open(filepath, self.write_mode_string) as json_file: + json.dump(dictionary, json_file) diff --git a/src/index.js b/src/index.js index 46ae852..bffebe9 100644 --- a/src/index.js +++ b/src/index.js @@ -10,6 +10,7 @@ const { Worker } = require('worker_threads'); const isMac = process.platform === 'darwin' +const isLinux = process.platform === 'linux' let galleryViewInterval // Declare galleryViewInterval globally let ttsProcess; // Declare ttsProcess globally @@ -34,11 +35,26 @@ const template = [ ], }, ] - : []), + : isLinux ? [ + { + label: app.name, + submenu: [ + { role: 'about' }, + { type: 'separator' }, + { role: 'services' }, + { type: 'separator' }, + { role: 'hide' }, + { role: 'hideOthers' }, + { role: 'unhide' }, + { type: 'separator' }, + { role: 'quit' }, + ], + }, + ] : []), // { role: 'fileMenu' } { label: 'File', - submenu: [isMac ? { role: 'close' } : { role: 'quit' }], + submenu: [isMac ? { role: 'close' } : isLinux ? { role: 'close' } : { role: 'quit' }], }, // { role: 'editMenu' } { @@ -64,7 +80,19 @@ const template = [ ], }, ] - : [ + : isLinux ? [ + { role: 'pasteAndMatchStyle' }, + { role: 'delete' }, + { role: 'selectAll' }, + { type: 'separator' }, + { + label: 'Speech', + submenu: [ + { role: 'startSpeaking' }, + { role: 'stopSpeaking' }, + ], + }, + ] : [ { role: 'delete' }, { type: 'separator' }, { role: 'selectAll' }, @@ -100,7 +128,12 @@ const template = [ { type: 'separator' }, { role: 'window' }, ] - : [{ role: 'close' }]), + : isLinux ? [ + { type: 'separator' }, + { role: 'front' }, + { type: 'separator' }, + { role: 'window' }, + ] : [{ role: 'close' }]), ], }, { @@ -128,11 +161,15 @@ let pythonProcess // Declare pythonProcess globally function findPython() { const possibilities = [ // In packaged app - path.join(process.resourcesPath, 'llm', 'python', 'bin', 'python3'), + isMac ? path.join(process.resourcesPath, 'llm', 'python', 'bin', 'python3') + : isLinux ? 
path.join(process.resourcesPath, 'llm', 'env', 'bin', 'python3') + : path.join(process.resourcesPath, 'llm', 'python', 'bin', 'python3'), //WINDOWS: path.join(process.resourcesPath, 'llm', 'python', 'python.exe'), // In development - path.join(__dirname, '..', 'llm', 'python', 'bin', 'python3'), + isMac ? path.join(__dirname, '..', 'llm', 'python', 'bin', 'python3') + : isLinux ? path.join(__dirname, '..', 'llm', 'env', 'bin', 'python3') + : path.join(__dirname, '..', 'llm', 'python', 'bin', 'python3'), //WINDOWS: path.join(process.__dirname, 'llm', 'python', 'python.exe'), ] for (const path_to_python of possibilities) {