Initial commit.
This commit is contained in:
@@ -1,2 +1,7 @@
|
||||
# DeepSeek_PARTS
|
||||
DeepSeek model for PARTS Ltd
|
||||
|
||||
|
||||
# 1. Make sure Ollama is running
|
||||
# 2. Working command for open-webui:
|
||||
sudo docker run -d --network=host -v open-webui:/app/backend/data -e OLLAMA_BASE_URL=http://127.0.0.1:11434 --name open-webui --restart always ghcr.io/open-webui/open-webui:main
|
||||
0
datastores/__init__.py
Normal file
0
datastores/__init__.py
Normal file
BIN
datastores/__pycache__/__init__.cpython-310.pyc
Normal file
BIN
datastores/__pycache__/__init__.cpython-310.pyc
Normal file
Binary file not shown.
BIN
datastores/__pycache__/parts_ai_datastore.cpython-310.pyc
Normal file
BIN
datastores/__pycache__/parts_ai_datastore.cpython-310.pyc
Normal file
Binary file not shown.
42
datastores/parts_ai_datastore.py
Normal file
42
datastores/parts_ai_datastore.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import ClassVar
|
||||
|
||||
class Parts_Ai_DataStore:
    """Loads raw training documents (JSON and plain text) from a directory tree."""

    # Default location of the training corpus, relative to the working directory.
    TRAINING_DATA_DIRECTORY: ClassVar[str] = "docs/training_data"

    # Directory this instance actually scans (per-instance, set in __init__;
    # previously mis-annotated as ClassVar).
    training_data_directory: str

    def __init__(self, training_data_directory):
        """Remember the directory to scan for training files."""
        self.training_data_directory = training_data_directory

    @classmethod
    def make_default(cls):
        """Alternate constructor using the default corpus location."""
        return cls(cls.TRAINING_DATA_DIRECTORY)

    def load_training_data(self):
        """Walk the configured directory and collect training documents.

        Returns a list of {"type": "json"|"txt", "content": ...} records;
        .json files are parsed, .txt files are read verbatim, anything else
        is ignored.
        """
        data = []
        # BUG FIX: walk the instance's configured directory, not the class
        # constant — previously the __init__ argument was silently ignored.
        for root, _dirs, files in os.walk(self.training_data_directory):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                if file_name.endswith(".json"):
                    data.append({
                        "type": "json",
                        "content": self.load_json_data(file_path),
                    })
                elif file_name.endswith(".txt"):
                    data.append({
                        "type": "txt",
                        "content": self.load_txt_data(file_path),
                    })
        return data

    @staticmethod
    def load_json_data(file_path):
        """Parse a JSON file and return the resulting object."""
        with open(file_path) as file:
            return json.load(file)

    @staticmethod
    def load_txt_data(file_path):
        """Return the full text content of a file."""
        with open(file_path) as file:
            return file.read()
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
21
docs/training_data/miscellaneous/partsltd.json
Normal file
21
docs/training_data/miscellaneous/partsltd.json
Normal file
@@ -0,0 +1,21 @@
|
||||
[
|
||||
{"input": "What is our company name?", "output": "Our company name is Precision And Research Technology Systems Limited, a.k.a. PARTS Ltd."},
|
||||
{"input": "What is our company's mission?", "output": "Our mission is to provide software engineering services. Our main product is PARTS Enterprise, an ERP system with an optional supplemental WooCommerce store based on ERPNext."},
|
||||
{"input": "What is our main product offering?", "output": "Our main product is PARTS Enterprise, an ERP system with an optional supplemental WooCommerce store based on ERPNext. The upfront cost for migration and purchase is upwards of £10k and the monthly subscription for hosting and support is £200."},
|
||||
{"input": "What is our company's target market?", "output": "Our target market is small to medium-sized businesses in the UK, with a focus on the manufacturing and distribution sectors."},
|
||||
{"input": "What is our company's unique selling point?", "output": "Our unique selling point is that we are the only company in the UK that offers a fully integrated ERP system with a WooCommerce store based on ERPNext."},
|
||||
{"input": "What is our company's competitive advantage?", "output": "Our competitive advantage is that we are a small company with low overheads, so we can offer a more personalised service at a lower cost than our competitors."},
|
||||
{"input": "What is our company's pricing strategy?", "output": "Our pricing strategy is to charge a one-time fee for migration and purchase, followed by a monthly subscription for hosting and support."},
|
||||
{"input": "What is our company's customer service policy?", "output": "Our customer service policy is to provide ticket-based support via email and web portal between 09:00-17:00 Monday-Friday, with a guaranteed response time of 24 hours 90% of the time."},
|
||||
{"input": "What is our company's environmental policy?", "output": "Our environmental policy is to minimise our carbon footprint by working remotely and using renewable energy sources."},
|
||||
{"input": "What is our company's legal status?", "output": "Our company is a private limited company."},
|
||||
{"input": "What is our company's vision?", "output": "Our vision is to be the leading provider of software engineering services in the world."},
|
||||
{"input": "What is our company's location?", "output": "Our company is located in the founder's home address: 53 Alfred Green Close, Rugby, CV22 6DN, United Kingdom."},
|
||||
{"input": "What is our company's phone number?", "output": "Our company's phone number is +447375 571430."},
|
||||
{"input": "What is our company's email address?", "output": "Our company's email address is teddy@partsltd.co.uk"},
|
||||
{"input": "What is our company's website?", "output": "Our company's website is https://partsltd.co.uk."},
|
||||
{"input": "What is our company's logo?", "output": "Our company's logo is a 3D wireframe of a simple light microscope, above a galaxy background."},
|
||||
{"input": "What is our company's core values?", "output": "Our company's core values are integrity, excellence, innovation, and teamwork."},
|
||||
{"input": "What is our company's history?", "output": "Our company was founded in 2021 by Lord Edward Middleton-Smith. We started as an electromechanical engineering services company, but have since pivoted to software engineering services in order to work remotely."},
|
||||
{"input": "Who is the founder of our company?", "output": "The founder of our company, and the only employee, is Lord Edward Middleton-Smith."}
|
||||
]
|
||||
BIN
helpers/__pycache__/ai_helper.cpython-310.pyc
Normal file
BIN
helpers/__pycache__/ai_helper.cpython-310.pyc
Normal file
Binary file not shown.
4
helpers/ai_helper.py
Normal file
4
helpers/ai_helper.py
Normal file
@@ -0,0 +1,4 @@
|
||||
class Ai_Helper:
    """Utility helpers shared by the PARTS AI scripts."""

    @staticmethod
    def get_pretrained_model_path(model_name, is_fine_tuned = False):
        """Return the local cache path for *model_name*.

        When *is_fine_tuned* is true the path carries a '-finetuned' suffix,
        distinguishing the tuned checkpoint from the base download.
        """
        suffix = "-finetuned" if is_fine_tuned else ""
        return f"./pretrained_models/{model_name}{suffix}"
|
||||
55
main.py
Normal file
55
main.py
Normal file
@@ -0,0 +1,55 @@
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
|
||||
from typing import ClassVar
|
||||
from datasets import Dataset
|
||||
|
||||
class AiTrainerParts:
    """Early prototype of the fine-tuning driver (superseded by Parts_Ai_Trainer).

    Downloads a causal-LM plus tokenizer, fine-tunes it on a small demo
    dataset, and saves the result.
    """

    DEFAULT_MODEL_NAME: ClassVar[str] = "deepseek-ai/DeepSeek-V3"

    # Instance attributes: model_name (str), model, tokenizer, trainer,
    # training_data — set lazily by the methods below.

    def __init__(self, model_name):
        """Store the model identifier; call download_model_and_tokenizer() to fetch weights."""
        # BUG FIX: the class had no __init__, so make_default() raised TypeError.
        self.model_name = model_name

    @classmethod
    def make_default(cls):
        """Build a trainer configured for the default DeepSeek model."""
        return cls(cls.DEFAULT_MODEL_NAME)

    def download_model_and_tokenizer(self):
        """Download model weights and tokenizer from the Hugging Face hub."""
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model_and_tokenizer(self):
        """Save the base (not yet fine-tuned) model and tokenizer locally."""
        path = f"./pretrained_models/{self.model_name}"
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

    def tokenizer_preprocess_function(self, examples):
        """Tokenize a batch of "input" strings to fixed-length (512) sequences.

        BUG FIX: was a @staticmethod calling an undefined global `tokenizer`;
        it now uses the instance's tokenizer.
        """
        return self.tokenizer(examples["input"], truncation=True, padding="max_length", max_length=512)

    def tokenize_training_data(self):
        """Build and tokenize a small in-memory demo dataset; return it.

        BUG FIX: the original signature was missing `self`, so the method was
        uncallable on an instance.
        """
        data = Dataset.from_dict({
            "input": ["What is our company's mission?", "Who is the CEO?"],
            "output": ["Our mission is to...", "The CEO is John Doe."],
        })
        return data.map(self.tokenizer_preprocess_function, batched=True)

    def fine_tune_model(self):
        """Fine-tune the loaded model on self.training_data."""
        training_args = TrainingArguments(
            output_dir="./deepseek-v3-finetuned",
            per_device_train_batch_size=4,
            num_train_epochs=3,
            logging_dir="./logs",
            save_steps=10_000,
            save_total_limit=2,
        )
        # BUG FIX: keep the trainer on self so save_fine_tuned_model can reuse
        # it (it was previously a discarded local).
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.training_data,
        )
        self.trainer.train()

    def save_fine_tuned_model(self):
        """Persist the fine-tuned model and its tokenizer."""
        self.trainer.save_model("./deepseek-v3-finetuned")
        self.tokenizer.save_pretrained("./deepseek-v3-finetuned")
|
||||
36
parts_ai_assistant.py
Normal file
36
parts_ai_assistant.py
Normal file
@@ -0,0 +1,36 @@
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
||||
from typing import ClassVar
|
||||
|
||||
from DeepSeek_PARTS.helpers.ai_helper import Ai_Helper
|
||||
|
||||
class Parts_Ai_Assistant:
    """Chat assistant that serves responses from the locally fine-tuned model."""

    DEFAULT_MODEL_NAME: ClassVar[str] = "deepseek-ai/DeepSeek-V3"

    # Instance attributes (set in __init__ / load_pretrained_model):
    #   model_name - Hugging Face model identifier
    #   model / tokenizer - loaded fine-tuned artefacts
    #   generator - text-generation pipeline built from model + tokenizer
    # (Previously mis-annotated as ClassVar; they are per-instance.)

    def __init__(self, model_name):
        """Store the model name and eagerly load the fine-tuned model."""
        self.model_name = model_name
        self.load_pretrained_model()

    @classmethod
    def make_default(cls):
        """Build an assistant for the default DeepSeek model."""
        return cls(cls.DEFAULT_MODEL_NAME)

    def load_pretrained_model(self):
        """Load the fine-tuned model/tokenizer from the local cache and build the pipeline."""
        path = Ai_Helper.get_pretrained_model_path(self.model_name, is_fine_tuned = True)
        self.model = AutoModelForCausalLM.from_pretrained(path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.generator = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)

    def get_response(self, prompt, conversation_history = None):
        """Generate a response for *prompt*, optionally prefixed by prior turns.

        BUG FIX: the default was a shared mutable list ([]), which all calls
        would alias; None is now the sentinel for "no history".
        """
        history = conversation_history if conversation_history is not None else []
        complete_prompt = "\n".join(history + [prompt])
        return self.generator(complete_prompt)
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: build the default assistant, ask one question, and
    # print the raw pipeline reply.
    print(Parts_Ai_Assistant.make_default().get_response("What is our company's mission?"))
|
||||
95
parts_ai_trainer.py
Normal file
95
parts_ai_trainer.py
Normal file
@@ -0,0 +1,95 @@
|
||||
|
||||
from datasets import Dataset
|
||||
import json
|
||||
import os
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
|
||||
from typing import ClassVar
|
||||
|
||||
from datastores.parts_ai_datastore import Parts_Ai_DataStore
|
||||
from helpers.ai_helper import Ai_Helper
|
||||
|
||||
class Parts_Ai_Trainer:
    """End-to-end fine-tuning driver: download base model, load corpus, train, save."""

    DEFAULT_MODEL_NAME: ClassVar[str] = "deepseek-ai/DeepSeek-V3"
    TRAINING_DATA_DIRECTORY: ClassVar[str] = "docs/training_data"

    # Instance attributes: model_name (str), model, tokenizer, trainer,
    # training_data — set by __init__ and the pipeline methods below.
    # (Previously mis-annotated as ClassVar; they are per-instance.)

    def __init__(self, model_name):
        """Store the model identifier and eagerly download model + tokenizer."""
        self.model_name = model_name
        self.download_model_and_tokenizer()

    @classmethod
    def make_default(cls):
        """Build a trainer configured for the default DeepSeek model."""
        return cls(cls.DEFAULT_MODEL_NAME)

    def download_model_and_tokenizer(self):
        """Download model weights and tokenizer from the Hugging Face hub."""
        self.model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

    def save_model_and_tokenizer(self):
        """Save the base (not fine-tuned) model and tokenizer locally."""
        path = Ai_Helper.get_pretrained_model_path(self.model_name, is_fine_tuned = False)
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

    def train_and_save_model(self):
        """Run the full pipeline: load + preprocess corpus, fine-tune, save."""
        self.load_and_preprocess_training_data()
        self.fine_tune_model()
        self.save_fine_tuned_model()

    def load_and_preprocess_training_data(self):
        """Load raw documents, convert to input/output pairs, then tokenize."""
        self.load_training_data()
        self.preprocess_data()
        self.tokenize_preprocessed_data()

    def load_training_data(self):
        """Load raw training records from the default datastore onto self.

        BUG FIX: the result is now stored on self (preprocess_data reads
        self.training_data); it was previously returned and discarded.
        """
        datastore = Parts_Ai_DataStore.make_default()
        self.training_data = datastore.load_training_data()
        return self.training_data

    def preprocess_data(self):
        """Convert raw records into a Dataset of input/output string pairs."""
        processed_data = []
        for item in self.training_data:
            if item["type"] == "json":
                content = item["content"]
                # BUG FIX: the JSON corpus files are LISTS of
                # {"input": ..., "output": ...} records, so .items() crashed;
                # dict content is still accepted as key -> value pairs.
                if isinstance(content, dict):
                    for key, value in content.items():
                        processed_data.append({"input": key, "output": value})
                else:
                    for entry in content:
                        processed_data.append({"input": entry["input"], "output": entry["output"]})
            elif item["type"] == "txt":
                # First line of a text file is treated as the prompt, the rest
                # as the expected completion.
                if "\n" in item["content"]:
                    first_line, contents = item["content"].split("\n", 1)
                else:
                    first_line, contents = item["content"], ""
                processed_data.append({"input": first_line, "output": contents})
        self.training_data = Dataset.from_dict({
            "input": [item["input"] for item in processed_data],
            "output": [item["output"] for item in processed_data],
        })

    def tokenize_preprocessed_data(self):
        """Tokenize every input in self.training_data in batches."""
        self.training_data = self.training_data.map(self.tokenizer_preprocess_function, batched=True)

    def tokenizer_preprocess_function(self, examples):
        """Tokenize a batch of "input" strings to fixed-length (512) sequences.

        BUG FIX: this was decorated @staticmethod while taking `self`, so the
        dataset batch was bound to `self` and the call always failed; it is
        now a plain instance method.
        """
        return self.tokenizer(examples["input"], truncation = True, padding = "max_length", max_length = 512)

    def fine_tune_model(self):
        """Fine-tune the loaded model on the tokenized training data."""
        training_args = TrainingArguments(
            output_dir = Ai_Helper.get_pretrained_model_path(self.model_name, is_fine_tuned = True),
            per_device_train_batch_size = 4,
            num_train_epochs = 3,
            logging_dir = "./logs",
            save_steps = 10_000,
            save_total_limit = 2,
        )
        # BUG FIX: keep the trainer on self so save_fine_tuned_model can reuse
        # it (it was previously a discarded local).
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.training_data,
        )
        self.trainer.train()

    def save_fine_tuned_model(self):
        """Persist the fine-tuned model and its tokenizer."""
        path = Ai_Helper.get_pretrained_model_path(self.model_name, is_fine_tuned = True)
        self.trainer.save_model(path)
        self.tokenizer.save_pretrained(path)
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: fine-tune the default model on the local corpus and save it.
    Parts_Ai_Trainer.make_default().train_and_save_model()
|
||||
|
||||
|
||||
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
transformers
datasets
|
||||
Reference in New Issue
Block a user