From 457abd23e7ff8da321e46ad6d2d8a3d5a2f2c4b5 Mon Sep 17 00:00:00 2001 From: Teddy Middleton-Smith Date: Thu, 20 Feb 2025 11:56:58 +0000 Subject: [PATCH] Initial commit. --- README.md | 5 + datastores/__init__.py | 0 .../__pycache__/__init__.cpython-310.pyc | Bin 0 -> 186 bytes .../parts_ai_datastore.cpython-310.pyc | Bin 0 -> 1787 bytes datastores/parts_ai_datastore.py | 42 + ...e_parts_enterprise_hosting_and_support.txt | 1080 ++++++++++++++ ..._parts_enterprise_purchase_and_install.txt | 1244 +++++++++++++++++ .../training_data/miscellaneous/partsltd.json | 21 + helpers/__pycache__/ai_helper.cpython-310.pyc | Bin 0 -> 595 bytes helpers/ai_helper.py | 4 + main.py | 55 + parts_ai_assistant.py | 36 + parts_ai_trainer.py | 95 ++ requirements.txt | 1 + 14 files changed, 2583 insertions(+) create mode 100644 datastores/__init__.py create mode 100644 datastores/__pycache__/__init__.cpython-310.pyc create mode 100644 datastores/__pycache__/parts_ai_datastore.cpython-310.pyc create mode 100644 datastores/parts_ai_datastore.py create mode 100644 docs/training_data/contracts/example_parts_enterprise_hosting_and_support.txt create mode 100644 docs/training_data/contracts/example_parts_enterprise_purchase_and_install.txt create mode 100644 docs/training_data/miscellaneous/partsltd.json create mode 100644 helpers/__pycache__/ai_helper.cpython-310.pyc create mode 100644 helpers/ai_helper.py create mode 100644 main.py create mode 100644 parts_ai_assistant.py create mode 100644 parts_ai_trainer.py create mode 100644 requirements.txt diff --git a/README.md b/README.md index 725ad84..6a812ec 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,7 @@ # DeepSeek_PARTS DeepSeek model for PARTS Ltd + + +# 1. Make sure Ollama is running +# 2. Working command for open-webui: +sudo docker run -d --network=host -v open-webui:/app/backend/data -e OLLAMA_BASE_URL=http://127.0.0.1:11434 --name open-webui --restart always ghcr.io/open-webui/open-webui:main \ No newline at end of file diff --git a/datastores/__init__.py b/datastores/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datastores/__pycache__/__init__.cpython-310.pyc b/datastores/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b45e38fbb07465f80819e8c07268035e91145153 GIT binary patch literal 186 zcmd1j<>g`k0+0ID=^*+sh(HF6K#l_t7qb9~6oz01O-8?!3`HPe1o10YKO;XkRlg)P zC8bi|B|o_|H#M)MSU;dBKfNe1H#ajcUB95RBqKjh-z7D*AUHKOJ3hcMC?pu2o03?P pSX`1{lv=DGAD@|*SrQ+wS5SG2!zMRBr8Fni4rFaH6OdqG003)YFN^>H literal 0 HcmV?d00001 diff --git a/datastores/__pycache__/parts_ai_datastore.cpython-310.pyc b/datastores/__pycache__/parts_ai_datastore.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18a99c4bfd52460a8754091b4599d5b064d14cd9 GIT binary patch literal 1787 zcmZ`(&2J+$6t_K|WHOy3RTlv(BzBh%p>Rk9LP$szqH55pmrykYw9z7Kh}|}6GLzb_ zN~4K{)ZV$TIHbpl|HxNP{0E4`!h23z(r)XK-@Nzy{QP?#c3!Q<1jegZf9y9yLjFWy zc?2MQ1Jg8tNFwQo^cH{Gqad>p8wb6BlIKJQlAjaF&sdL3+9i$fFW5sm4eFxJQEJQ& zsqUj@dEEFPf>sgq*<;_sIlVw}6+R}aB-uIvzU=wH^9wy+Co+_!bJB}cMV4iBPR?jA zmK7O;Zbc<>MJ8TX1->e4p0CL|U|V%-+o`rjYz#y@wQ1MpT20qv-Zw3)(?K@K_JxE6 z_s;Il##U#m^IWtyb~i+OYvm6WHgcHTc8tIV3#w$Ar;IvxYu*5uS4 zGTAb&eRQjj6krAu@f)*wXge=g{cj1O%=Fc~fxr-w>NwU-``Y#lU<^V#&tJ?Agm z-ok+AL@<4lr@-u!8MR~%sUETm+9p4J{xX=cSuo=T2kFyUXvr{`^MV!O1)G(Ma2`r} zL3N{`bLhW<%*#dTg391`*iXq(-CJ<5_{v*^U=iK6IEo6q1EtP1Yz|GHIgS)dYfmT2 zmHT;Sp@LK1w7*yv0BKYl%Z=kF>FCfW$?-{Q4`5?H$Xui{X-)1)g)a7HbQ4%Xi@sI5Zys;q+}9 z(He~@rwq9Gb<7uB^qT$3{^rT_<9jOhrpH@#!@jwx+m|3%=$$kATNSED=d6H={0Pk@ z35V|t-kdJ^jF`7hpd$FafQc7r(Px3nj2@+H$xK+x|a!8`ST~~zNRBD8E7w%)weFF0e zZVBN&eF*M#M|2u>yxdtjRzkQ~h;c5Dq4>fjLL42Zqs5AEJ&m<+fw9`vg-Ek3x2YZE znE~rk{__x?Hobx-D}8*~#|mB^sjHyTPf?v1wC`W(F}oTuFU8>tfR|neKrrnF&0?}j z6PD13u^^^Vz*b|*f*SQ!jnJ1!1xwE z#$J63PfklvByp1Y`KDhx^EGQU>I5x%c^`jZ{IFo4+w7MHi_KBiQ%?x`5c_Th6>h4ISVFE=X z*210(M^JW}3tta>k|n?mNIB=dESe>d8qJ3)8>=u$;&jaOxtU}s?*Zfiz>^qu@9u*` zIB~i(4`Z<{+xfgW8-o#tyNWzC6DwKU&|-`8pKodZ%OCi(@0W|JaB_qw>cT3YO#$m; zA)aU9)C#poo6y8j0=TtQM?0mznpMU2Cg9ImmAh&8 GZ<23wdz453 literal 0 HcmV?d00001 diff --git a/helpers/ai_helper.py b/helpers/ai_helper.py new file mode 100644 index 0000000..7a61d06 --- /dev/null +++ b/helpers/ai_helper.py @@ -0,0 +1,4 @@ +class Ai_Helper: + @staticmethod + def get_pretrained_model_path(model_name, is_fine_tuned = False): + return f"./pretrained_models/{model_name}{'-finetuned' if is_fine_tuned else ''}" \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..0096fb9 --- /dev/null +++ b/main.py @@ -0,0 +1,55 @@ + +from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments +from typing import ClassVar +from datasets import Dataset + +class AiTrainerParts: + DEFAULT_MODEL_NAME: ClassVar[str] = "deepseek-ai/DeepSeek-V3" + + model: ClassVar[AutoModelForCausalLM] + model_name: ClassVar[str] + tokenizer: ClassVar[AutoTokenizer] + trainer: ClassVar[Trainer] + + @classmethod + def make_default(cls): + return cls(cls.DEFAULT_MODEL_NAME) + + def download_model_and_tokenizer(self): + self.model = AutoModelForCausalLM.from_pretrained(self.model_name) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + + def save_model_and_tokenizer(self): + path = f"./pretrained_models/{self.model_name}" + self.model.save_pretrained(path) + self.tokenizer.save_pretrained(path) + + @staticmethod + def tokenizer_preprocess_function(examples): + return tokenizer(examples["input"], truncation=True, padding="max_length", max_length=512) + def tokenize_training_data(): + data = Dataset.from_dict({ + "input": ["What is our company's mission?", "Who is the CEO?"], + "output": ["Our mission is to...", "The CEO is John Doe."] + }) + tokenized_data = data.map(AiTrainerParts.tokenizer_preprocess_function, batched=True) + return tokenized_data + + def fine_tune_model(self): + training_args = TrainingArguments( + output_dir="./deepseek-v3-finetuned", + per_device_train_batch_size=4, + num_train_epochs=3, + logging_dir="./logs", + save_steps=10_000, + save_total_limit=2, + ) + trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=self.training_data, + ) + trainer.train() + def save_fine_tuned_model(self): + self.trainer.save_model("./deepseek-v3-finetuned") + self.tokenizer.save_pretrained("./deepseek-v3-finetuned") \ No newline at end of file diff --git a/parts_ai_assistant.py b/parts_ai_assistant.py new file mode 100644 index 0000000..5a81c37 --- /dev/null +++ b/parts_ai_assistant.py @@ -0,0 +1,36 @@ + +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline +from typing import ClassVar + +from DeepSeek_PARTS.helpers.ai_helper import Ai_Helper + +class Parts_Ai_Assistant: + DEFAULT_MODEL_NAME: ClassVar[str] = "deepseek-ai/DeepSeek-V3" + + generator: ClassVar[pipeline] + model: ClassVar[AutoModelForCausalLM] + model_name: ClassVar[str] + tokenizer: ClassVar[AutoTokenizer] + + def __init__(self, model_name): + self.model_name = model_name + self.load_pretrained_model() + + @classmethod + def make_default(cls): + return cls(cls.DEFAULT_MODEL_NAME) + + def load_pretrained_model(self): + path = Ai_Helper.get_pretrained_model_path(self.model_name, is_fine_tuned = True) + self.model = AutoModelForCausalLM.from_pretrained(path) + self.tokenizer = AutoTokenizer.from_pretrained(path) + self.generator = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer) + + def get_response(self, prompt, conversation_history = []): + complete_prompt = "\n".join(conversation_history + [prompt]) + return self.generator(complete_prompt) + +if __name__ == "__main__": + assistant = Parts_Ai_Assistant.make_default() + response = assistant.get_response("What is our company's mission?") + print(response) \ No newline at end of file diff --git a/parts_ai_trainer.py b/parts_ai_trainer.py new file mode 100644 index 0000000..e662bfa --- /dev/null +++ b/parts_ai_trainer.py @@ -0,0 +1,95 @@ + +from datasets import Dataset +import json +import os +from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments +from typing import ClassVar + +from datastores.parts_ai_datastore import Parts_Ai_DataStore +from helpers.ai_helper import Ai_Helper + +class Parts_Ai_Trainer: + DEFAULT_MODEL_NAME: ClassVar[str] = "deepseek-ai/DeepSeek-V3" + TRAINING_DATA_DIRECTORY: ClassVar[str] = "docs/training_data" + + model: ClassVar[AutoModelForCausalLM] + model_name: ClassVar[str] + tokenizer: ClassVar[AutoTokenizer] + trainer: ClassVar[Trainer] + training_data: ClassVar[object] + + def __init__(self, model_name): + self.model_name = model_name + self.download_model_and_tokenizer() + @classmethod + def make_default(cls): + return cls(cls.DEFAULT_MODEL_NAME) + + def download_model_and_tokenizer(self): + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + def save_model_and_tokenizer(self): + path = Ai_Helper.get_pretrained_model_path(self.model_name, is_fine_tuned = False) + self.model.save_pretrained(path) + self.tokenizer.save_pretrained(path) + + def train_and_save_model(self): + self.load_and_preprocess_training_data() + self.fine_tune_model() + self.save_fine_tuned_model() + def load_and_preprocess_training_data(self): + self.load_training_data() + self.preprocess_data() + self.tokenize_preprocessed_data() + def load_training_data(self): + datastore = Parts_Ai_DataStore.make_default() + return datastore.load_training_data() + def preprocess_data(self): + processed_data = [] + for item in self.training_data: + if item["type"] == "json": + for key, value in item["content"].items(): + processed_data.append({ + "input": key + , "output": value + }) + elif item["type"] == "txt": + first_line, contents = item["content"].split("\n", 1) if "\n" in item["content"] else (item["content"], "") + processed_data.append({ + "input": first_line + , "output": contents + }) + self.training_data = Dataset.from_dict({ + "input": [item["input"] for item in processed_data], + "output": [item["output"] for item in processed_data] + }) + def tokenize_preprocessed_data(self): + self.training_data = self.training_data.map(self.tokenizer_preprocess_function, batched=True) + @staticmethod + def tokenizer_preprocess_function(self, examples): + return self.tokenizer(examples["input"], truncation = True, padding = "max_length", max_length = 512) + def fine_tune_model(self): + training_args = TrainingArguments( + output_dir = Ai_Helper.get_pretrained_model_path(self.model_name, is_fine_tuned = True), + per_device_train_batch_size = 4, + num_train_epochs = 3, + logging_dir = "./logs", + save_steps = 10_000, + save_total_limit = 2, + ) + trainer = Trainer( + model=self.model, + args=training_args, + train_dataset=self.training_data, + ) + trainer.train() + def save_fine_tuned_model(self): + path = Ai_Helper.get_pretrained_model_path(self.model_name, is_fine_tuned = True) + self.trainer.save_model(path) + self.tokenizer.save_pretrained(path) + +if __name__ == "__main__": + trainer = Parts_Ai_Trainer.make_default() + trainer.train_and_save_model() + + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..747b7aa --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +transformers \ No newline at end of file