<aside> 💡 허깅 페이스 트랜스포머스 라이브러리의 GPT-2 모델로 문장 생성
</aside>
# Print the module hierarchy of a GPT-2 language-modeling head, one tree
# level per nesting depth of named_children().
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path="gpt2")

for name_1, child_1 in model.named_children():
    print(name_1)
    for name_2, child_2 in child_1.named_children():
        print("└", name_2)
        for name_3, child_3 in child_2.named_children():
            print("│ └", name_3)
            for name_4, child_4 in child_3.named_children():
                print("│ │ └", name_4)
# Generate sample continuations with GPT-2 through the high-level pipeline API.
from transformers import pipeline

generator = pipeline(task="text-generation", model="gpt2")

# pad_token_id is set explicitly because GPT-2 defines no pad token;
# reusing EOS silences the pipeline's padding warning.
outputs = generator(
    text_inputs="Machine learning is",
    max_length=20,
    num_return_sequences=3,
    pad_token_id=generator.tokenizer.eos_token_id,
)
print(outputs)
#CoLA 데이터세트 불러오기
import torch
from torchtext.datasets import CoLA
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
def collator(batch, tokenizer, device):
    """Collate a batch of CoLA samples into model-ready tensors.

    batch is an iterable of (source, label, text) triples; the source field
    is not used. Texts are tokenized together with longest-padding, and all
    returned tensors are moved to `device`.

    Returns (input_ids, attention_mask, labels).
    """
    _, labels, texts = zip(*batch)
    encoded = tokenizer(
        texts,
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )
    label_tensor = torch.tensor(labels, dtype=torch.long).to(device)
    return (
        encoded["input_ids"].to(device),
        encoded["attention_mask"].to(device),
        label_tensor,
    )
# Materialize the CoLA splits and wrap them in DataLoaders that share one
# GPT-2 tokenizer and target device.
train_data = list(CoLA(split="train"))
valid_data = list(CoLA(split="dev"))
test_data = list(CoLA(split="test"))

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 ships without a pad token; reuse EOS so batch padding works.
tokenizer.pad_token = tokenizer.eos_token

epochs = 3
batch_size = 16
device = "cuda" if torch.cuda.is_available() else "cpu"


def _make_loader(dataset, shuffle=False):
    # Bind the shared tokenizer and device into the collate function.
    return DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=lambda samples: collator(samples, tokenizer, device),
        shuffle=shuffle,
    )


train_dataloader = _make_loader(train_data, shuffle=True)
valid_dataloader = _make_loader(valid_data)
test_dataloader = _make_loader(test_data)

for split_name, split in (("Train", train_data), ("Valid", valid_data), ("Test", test_data)):
    print(f"{split_name} Dataset Length :", len(split))
# Configure GPT-2 with a binary classification head (CoLA: acceptable or not).
from torch import optim
from transformers import GPT2ForSequenceClassification

model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="gpt2",
    num_labels=2,
).to(device)
# Mirror the tokenizer's pad-token workaround on the model config so padded
# batches are classified correctly.
model.config.pad_token_id = model.config.eos_token_id

optimizer = optim.Adam(model.parameters(), lr=5e-5)
#GPT-2 모델 학습 및 검증
import numpy as np
from torch import nn
def calc_accuracy(preds, labels):
    """Return the fraction of rows whose argmax over axis 1 matches `labels`."""
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()
    return (predicted == actual).sum() / len(actual)
def train(model, optimizer, dataloader):
    """Run one training epoch and return the mean per-batch loss.

    Expects `dataloader` to yield (input_ids, attention_mask, labels) and
    `model` to return an object exposing a scalar `.loss`.
    """
    model.train()
    total_loss = 0.0
    for input_ids, attention_mask, labels in dataloader:
        batch_out = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        # Record the loss value before the update step.
        total_loss += batch_out.loss.item()
        optimizer.zero_grad()
        batch_out.loss.backward()
        optimizer.step()
    return total_loss / len(dataloader)
def evaluation(model, dataloader):
    """Evaluate `model` on `dataloader` without gradient tracking.

    Returns (mean loss, mean accuracy) over batches, both as Python floats.
    Expects batches of (input_ids, attention_mask, labels) and a model whose
    output exposes `.logits`.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    val_loss, val_accuracy = 0.0, 0.0
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            logits = outputs.logits
            loss = criterion(logits, labels)
            # Fix: accumulate a Python float (.item()), not a 0-dim tensor,
            # consistent with train(); avoids holding tensor references and
            # makes the returned loss a plain number for comparison/printing.
            val_loss += loss.item()
            preds = logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            val_accuracy += calc_accuracy(preds, label_ids)
    n_batches = len(dataloader)
    return val_loss / n_batches, val_accuracy / n_batches
# Train for `epochs` epochs; checkpoint whenever validation loss improves.
best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Val Accuracy {val_accuracy:.4f}")
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "../models/GPT2ForSequenceClassification.pt")
        print("Saved the model weights")
# Rebuild the classifier, load the best checkpoint, and report test metrics.
model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="gpt2",
    num_labels=2,
).to(device)
model.config.pad_token_id = model.config.eos_token_id
model.load_state_dict(torch.load("../models/GPT2ForSequenceClassification.pt"))

test_loss, test_accuracy = evaluation(model, test_dataloader)
print(f"Test Loss : {test_loss:.4f}")
print(f"Test Accuracy : {test_accuracy:.4f}")