I'm trying to fine-tune GPT-2 so that it can produce artificial data similar to this:
I need to change my contact info
I need to go back
Sorry i need to go back
go back
I need to go back
need to change name
I gave the wrong phone number
contact info
I put the wrong age down
I just realized I made a mistake
I put the wrong availability
I'm actually available Mondays and Fridays
I actually can work part time
Sorry I made a mistake earlier
I need to go back
go back
Please need to go back
I made a mistake
I first create a torch Dataset that feeds in the training data:
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv
class goBackDataset(Dataset):
    def __init__(self, goback_dataset_path='goback.txt'):
        super().__init__()
        self.goback_list = []
        self.end_of_text_token = "<EOS>"
        self.beginning_of_text_token = "<SOS>"
        # Wrap each row of the file in the beginning/end-of-text tokens
        with open(goback_dataset_path) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                goback_str = f"{self.beginning_of_text_token} {row[0]} {self.end_of_text_token}"
                self.goback_list.append(goback_str)

    def __len__(self):
        return len(self.goback_list)

    def __getitem__(self, item):
        return self.goback_list[item]
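As a quick sanity check (assuming goback.txt contains the rows listed above, one per line), each item is just the raw string wrapped in the special tokens:

# Sanity check only -- not part of the training code below
print(len(goBackDataset()))  # number of rows in goback.txt
print(goBackDataset()[0])    # "<SOS> I need to change my contact info <EOS>"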
I then initialize the dataset and tokenizer, adding beginning-of-text and end-of-text tokens as well as a padding token so that all sequences are the same length (I assume this is necessary?):
import torch
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

dataset = goBackDataset()

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
tokenizer.add_special_tokens({'bos_token': dataset.beginning_of_text_token,
                              'eos_token': dataset.end_of_text_token,
                              'pad_token': "<PAD>"})

model = GPT2LMHeadModel.from_pretrained('gpt2-large')
model.resize_token_embeddings(len(tokenizer))  # account for the 3 added special tokens
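Just to confirm the special tokens were actually registered (again, a sanity check rather than part of training):

# Sanity check only
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)  # <SOS> <EOS> <PAD>
print(len(tokenizer))                                # original GPT-2 vocab (50257) + 3 added tokens
print(model.get_input_embeddings().weight.shape[0])  # should equal len(tokenizer) after the resize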
I then train the model with a batch size of 4 (chosen arbitrarily based on the number of samples I have):
learning_rate = 5e-4
epsilon = 1e-8
optimizer = AdamW(model.parameters(),
                  lr=learning_rate,
                  eps=epsilon)

goback_loader = DataLoader(dataset, batch_size=4, shuffle=True)

epochs = 2
max_length = 50

model.train()
for epoch in range(epochs):
    combined_loss = 0
    print(f'epoch {epoch}')
    for batch in goback_loader:
        print(batch)
        # Tokenize the batch of strings, padding/truncating everything to max_length
        tokenized = tokenizer(batch, truncation=True, max_length=max_length, padding='max_length')
        # Take the first sequence of the batch as a 1-D tensor
        input_ids = torch.tensor(tokenized['input_ids'][0])
        attention_mask = torch.tensor(tokenized['attention_mask'][0])
        model.zero_grad()
        # Standard causal LM objective: labels are the input ids themselves
        outputs = model(input_ids, labels=input_ids, attention_mask=attention_mask)
        loss = outputs[0]
        combined_loss += loss.item()
        loss.backward()
        optimizer.step()
    print('loss: ', combined_loss)
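For context, this is roughly what comes out of the DataLoader and the tokenizer at each step (printed outside the loop, for illustration only):

# Illustration only -- the default collate_fn keeps a batch as a plain list of 4 strings
batch = next(iter(goback_loader))
print(batch)  # e.g. ['<SOS> go back <EOS>', '<SOS> contact info <EOS>', ...]

tokenized = tokenizer(batch, truncation=True, max_length=max_length, padding='max_length')
print(len(tokenized['input_ids']), len(tokenized['input_ids'][0]))  # 4 sequences, each of length max_length (50)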
Yet the result is gibberish: padding tokens with fragments like "change my" strewn in.
The best I could do was to enforce a no-repeat n-gram limit, a small max length, and a high temperature, yet it's still a mess:
tokenizer.decode(model.generate(inputs=None,
                                do_sample=True,
                                max_length=10,
                                no_repeat_ngram_size=1,
                                temperature=0.9,
                                skip_special_tokens=True)[0])
'<|endoftext|>Could <PAD> change my interviewI <EOS> back schedule'
Is GPT-2 not robust enough to handle this task with this amount of data, or am I doing something wrong? The output is clearly not ending with <EOS> tokens, it's starting with <|endoftext|>, and it's including <PAD>. Is this even expected behavior?
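In case it's useful, this is how I looked at the raw generated ids (inspection only, same sampling settings as above):

# Inspection only
out = model.generate(inputs=None, do_sample=True, max_length=10,
                     no_repeat_ngram_size=1, temperature=0.9)
print(out[0].tolist())                                   # raw token ids
print(tokenizer.convert_ids_to_tokens(out[0].tolist()))  # the corresponding tokens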