# 네이버 영화 리뷰 감정 분석 데이터세트를 활용해 분류 모델을 학습
#네이버 영화 리뷰 데이터 불러오기
import numpy as np
import pandas as pd
from Korpora import Korpora
# Load the Naver movie review sentiment corpus ("nsmc") through Korpora and
# keep a reproducible 20,000-row sample of its `test` portion.
corpus = Korpora.load("nsmc")
df = pd.DataFrame(corpus.test).sample(20000, random_state=42)

# Shuffle the sample, then cut it at 60% and 80% to obtain a 60/20/20
# train/validation/test split.
# Positional slicing is used instead of np.split(DataFrame, ...): np.split
# goes through DataFrame.swapaxes, which pandas deprecated in 2.1, so it
# emits a FutureWarning or fails on newer pandas releases. The resulting
# frames contain exactly the same rows in the same order.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# Preview a few training rows and report the size of each split.
print(train.head(5).to_markdown())
print(f"Training Data Size : {len(train)}")
print(f"Validation Data Size : {len(valid)}")
print(f"Testing Data Size : {len(test)}")
#BERT 입력 텐서 생성
import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
def make_dataset(data, tokenizer, device):
    """Tokenize the texts in *data* and bundle them with labels as a TensorDataset.

    Args:
        data: Object exposing ``text`` and ``label`` attributes; presumably a
            pandas DataFrame with those two columns (``.tolist()`` and
            ``.values`` are called on them).
        tokenizer: Callable invoked with ``text``, ``padding``, ``truncation``
            and ``return_tensors`` keyword arguments; its result must provide
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        device: Target device every tensor is moved to.

    Returns:
        A ``TensorDataset`` whose items are
        ``(input_ids, attention_mask, label)`` tuples.
    """
    # Pad every sequence to the longest one in this call and request
    # PyTorch tensors so the result can be wrapped directly.
    encoding = tokenizer(
        text=data.text.tolist(),
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )
    features = [
        encoding[key].to(device) for key in ("input_ids", "attention_mask")
    ]
    # Labels are stored as int64 (torch.long).
    targets = torch.tensor(data.label.values, dtype=torch.long).to(device)
    return TensorDataset(*features, targets)
def get_datalodader(dataset, sampler, batch_size):
    """Build a DataLoader over *dataset* using the given sampler class.

    Args:
        dataset: Dataset to iterate over.
        sampler: Sampler class (or any callable) that is called with
            *dataset* to create the sampler instance.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` that draws indices from ``sampler(dataset)``.
    """
    # NOTE: the function name is misspelled ("datalodader") but is kept
    # as-is because callers reference it by this name.
    return DataLoader(dataset, sampler=sampler(dataset), batch_size=batch_size)
# Run configuration. `epochs` is not read in this section; presumably it is
# consumed by a training loop elsewhere in the file.
epochs = 5
batch_size = 32

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Cased multilingual BERT vocabulary; lower-casing is disabled.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased",
    do_lower_case=False,
)

# Encode each split into tensors placed on `device`.
train_dataset = make_dataset(train, tokenizer, device)
valid_dataset = make_dataset(valid, tokenizer, device)
test_dataset = make_dataset(test, tokenizer, device)

# Training batches are drawn with RandomSampler; validation and test
# batches are read in order with SequentialSampler.
train_dataloader = get_datalodader(train_dataset, RandomSampler, batch_size)
valid_dataloader = get_datalodader(valid_dataset, SequentialSampler, batch_size)
test_dataloader = get_datalodader(test_dataset, SequentialSampler, batch_size)

# Show one encoded training example as a sanity check.
print(train_dataset[0])
#BERT 모델 선언
from torch import optim
from transformers import BertForSequenceClassification
# Load the pretrained multilingual BERT weights with a 2-label
# classification head, then move the model to the selected device.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased",
    num_labels=2,
)
model = model.to(device)

# AdamW over all model parameters.
optimizer = optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8)