
I'm trying to fine-tune GPT-2 so that it can produce artificial data similar to this:

I need to change my contact info
I need to go back
Sorry i need to go back
go back
I need to go back
need to change name
I gave the wrong phone number
contact info
I put the wrong age down
I just realized I made a mistake
I put the wrong availability
I'm actually available Mondays and Fridays
I actually can work part time
Sorry I made a mistake earlier
I need to go back
go back
Please need to go back
I made a mistake

I first create a torch Dataset that feeds in the training data:

from torch.utils.data import Dataset, DataLoader
import csv

class goBackDataset(Dataset):
    def __init__(self, goback_dataset_path = 'goback.txt'):
        super().__init__()

        self.goback_list = []
        self.end_of_text_token = "<EOS>"
        self.beginning_of_text_token = "<SOS>"
        
        # Wrap each utterance in the custom BOS/EOS markers
        with open(goback_dataset_path) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                goback_str = f"{self.beginning_of_text_token} {row[0]} {self.end_of_text_token}"
                self.goback_list.append(goback_str)
        
    def __len__(self):
        return len(self.goback_list)

    def __getitem__(self, item):
        return self.goback_list[item]
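
For reference (purely illustrative, not part of the training script), instantiating the dataset and printing an item confirms the BOS/EOS wrapping:

ds = goBackDataset()
print(len(ds))
print(ds[0])   # should print something like "<SOS> I need to change my contact info <EOS>"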

I then initialize the dataset and the tokenizer, adding beginning-of-text and end-of-text tokens as well as a padding token so that all sequences end up the same length (I assume this is necessary?):

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

dataset = goBackDataset()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
tokenizer.add_special_tokens({'bos_token': dataset.beginning_of_text_token, 'eos_token':dataset.end_of_text_token, 'pad_token':"<PAD>"})
model = GPT2LMHeadModel.from_pretrained('gpt2-large')
model.resize_token_embeddings(len(tokenizer))
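
As a quick sanity check (again just illustrative), tokenizing one wrapped sample shows that the added special tokens and <PAD> get their own ids:

check = tokenizer("<SOS> I need to go back <EOS>", padding='max_length', max_length=12)
print(check['input_ids'])
print(tokenizer.convert_ids_to_tokens(check['input_ids']))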

from transformers import AdamW

I then train the model with a batch size of 4 (chosen arbitrarily based on the number of samples I have):

epochs = 2
learning_rate = 5e-4
epsilon = 1e-8

optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )

goback_loader = DataLoader(dataset, batch_size=4, shuffle=True)

max_length = 50
model.train()

for epoch in range(epochs):
    combined_loss = 0
    print(f'epoch {epoch}')
    for batch in goback_loader:
        # Tokenize the batch of strings, padding/truncating to max_length
        tokenized = tokenizer(batch, truncation=True, max_length=max_length, padding='max_length')
        input_ids = torch.tensor(tokenized['input_ids'][0])
        attention_mask = torch.tensor(tokenized['attention_mask'][0])

        model.zero_grad()

        # Causal language-modeling objective: the labels are the input ids themselves
        outputs = model(input_ids, labels=input_ids, attention_mask=attention_mask)
        loss = outputs[0]
        combined_loss += loss.item()

        loss.backward()
        optimizer.step()
    print('loss: ', combined_loss)

Yet instead of the desired result, I get gibberish: padding tokens with words like "change my" strewn in.

The best I could do was to limit repeated n-grams, enforce a small max length, and set a temperature of 0.9, yet it's still a mess:

tokenizer.decode(model.generate(inputs = None,
                                do_sample = True, 
                                max_length = 10, 
                                no_repeat_ngram_size = 1,
                                temperature = 0.9,
                                skip_special_tokens = True)[0])
'<|endoftext|>Could <PAD>  change my interviewI <EOS>  back schedule'

Is GPT-2 not robust enough to handle this task with this amount of data, or am I doing something wrong? The output clearly doesn't end with an <EOS> token, it starts with <|endoftext|>, and it includes <PAD>. Is this even expected behavior?
