Building a Simple Multi-Class Classification Model with KoBERT
What is KoBERT?
BERT is a pretrained language model released by Google in 2018, and it delivers strong performance across a wide range of NLP tasks.
KoBERT is a Korean adaptation of this BERT model: a pretrained Korean language model released by SKTBrain.
!pip install mxnet
!pip install gluonnlp
# Load KoBERT from GitHub
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm
#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
Preparing the Dataset
The Naver movie review dataset
!wget https://www.dropbox.com/s/374ftkec978br3d/ratings_train.txt?dl=1
!wget https://www.dropbox.com/s/977gbwh542gdy94/ratings_test.txt?dl=1
train = nlp.data.TSVDataset("ratings_train.txt?dl=1", field_indices=[1,2], num_discard_samples=1)
test = nlp.data.TSVDataset("ratings_test.txt?dl=1", field_indices=[1,2], num_discard_samples=1)
train[0]
['아 더빙.. 진짜 짜증나네요 목소리', '0']
train[1]
['흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1']
KoBERT
# Hyperparameter settings
max_len = 64 # maximum length of the input text sequences
batch_size = 64
warmup_ratio = 0.1
num_epochs = 2
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
The data must be tokenized, integer-encoded, and padded into a form that the KoBERT model can take as input.
BERTSentenceTransform handles the tokenization and padding.
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))
# Load the BERT model and vocabulary
bertmodel, vocab = get_pytorch_kobert_model()
# Tokenizer
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
data_train = BERTDataset(train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(test, 0, 1, tok, max_len, True, False)
using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
data_train[0]
(array([ 2, 3093, 1698, 6456, 517, 54, 517, 54, 4368, 4396, 7316, 5655, 5703, 2073, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32), array(15, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), 0)
Of the arrays in this tuple,
the first is the padded token-ID sequence,
the second is the valid length of the sentence,
and the third is the segment (token type) ID sequence; the final value is the label.
BERT also needs an attention mask: when the input passes through BERT's attention computation, the mask tells the model which positions are real tokens and which are padding (token ID 1 here) and can be ignored. In this code the attention mask is not stored in the dataset; it is generated later from the valid length inside the classifier (see gen_attention_mask below).
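As a quick check, here is a minimal sketch (using the data_train sample above) of how such a mask can be derived from the valid length; it mirrors the gen_attention_mask method of the classifier defined below.
# Minimal sketch: build an attention mask from the padded sequence and its valid length.
# 1 marks real tokens, 0 marks padding positions that attention can ignore.
token_ids, valid_length, segment_ids, label = data_train[0]
attention_mask = np.zeros_like(token_ids)
attention_mask[:int(valid_length)] = 1
print(attention_mask)  # the first 15 positions are 1, the remaining padding positions are 0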
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=2)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=2)
Model
Set num_classes to the number of classes.
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=2,  # adjust to the number of classes
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device))
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)
# Build the classifier on top of the pretrained BERT model
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
# Set up the optimizer and learning-rate schedule
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
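As the warning notes, the AdamW class in transformers is deprecated. If you prefer to follow its suggestion, a minimal swap looks like this; it assumes PyTorch's built-in optimizer is acceptable, whose defaults differ slightly from the transformers version.
# Optional alternative: use PyTorch's own AdamW, as the deprecation warning suggests.
# It accepts the same parameter groups; its default hyperparameters differ slightly,
# so results may not match the logs below exactly.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)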
# Accuracy helper
def calc_accuracy(X, Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
    return train_acc
Train
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
epoch 1 batch id 1 loss 0.6700725555419922 train acc 0.53125
epoch 1 batch id 201 loss 0.3991697132587433 train acc 0.6117070895522388
epoch 1 batch id 401 loss 0.4272489547729492 train acc 0.7094763092269327
epoch 1 batch id 601 loss 0.4097176492214203 train acc 0.7530678036605657
epoch 1 batch id 801 loss 0.5139654278755188 train acc 0.7778753121098627
epoch 1 batch id 1001 loss 0.31333789229393005 train acc 0.7924107142857143
epoch 1 batch id 1201 loss 0.29457372426986694 train acc 0.8034320358034971
epoch 1 batch id 1401 loss 0.36195579171180725 train acc 0.8107601713062098
epoch 1 batch id 1601 loss 0.41905707120895386 train acc 0.817340724547158
epoch 1 batch id 1801 loss 0.24425488710403442 train acc 0.8234140755136036
epoch 1 batch id 2001 loss 0.27006250619888306 train acc 0.8284763868065967
epoch 1 batch id 2201 loss 0.3236289620399475 train acc 0.8327961721944571
epoch 1 train acc 0.8359397219852104
epoch 1 test acc 0.8834718670076727
epoch 2 batch id 1 loss 0.4443327784538269 train acc 0.84375
epoch 2 batch id 201 loss 0.21954092383384705 train acc 0.8803638059701493
epoch 2 batch id 401 loss 0.2106774002313614 train acc 0.8858322942643392
epoch 2 batch id 601 loss 0.3419431149959564 train acc 0.8890650998336106
epoch 2 batch id 801 loss 0.3492409288883209 train acc 0.891522315855181
epoch 2 batch id 1001 loss 0.24526211619377136 train acc 0.8937624875124875
epoch 2 batch id 1201 loss 0.2076842039823532 train acc 0.8957509367194005
epoch 2 batch id 1401 loss 0.21389761567115784 train acc 0.8973278015703069
epoch 2 batch id 1601 loss 0.2874751389026642 train acc 0.8993402560899438
epoch 2 batch id 1801 loss 0.12477239966392517 train acc 0.9012961549139367
epoch 2 batch id 2001 loss 0.2274707704782486 train acc 0.9034545227386307
epoch 2 batch id 2201 loss 0.20406942069530487 train acc 0.9049224784189005
epoch 2 train acc 0.9061277908134243
epoch 2 test acc 0.8938618925831202
Test
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

model.eval()
output = []
for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    label = label.long().to(device)
    out = model(token_ids, valid_length, segment_ids)
    for i in out:
        logits = i.detach().cpu().numpy()
        output.append(logits)
result = np.argmax(output, axis=1)
result
array([1, 0, 1, ..., 0, 0, 0])
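To classify a new review with the trained model, the same preprocessing pipeline can be reused. Below is a minimal sketch; the predict helper and the example sentence are only illustrative and not part of the original code.
# Minimal sketch: classify a single sentence with the trained model.
# The helper name and the example sentence are illustrative assumptions.
def predict(sentence):
    # Wrap the sentence in the same BERTDataset pipeline (dummy label '0').
    data = BERTDataset([[sentence, '0']], 0, 1, tok, max_len, True, False)
    token_ids, valid_length, segment_ids, _ = data[0]
    token_ids = torch.tensor([token_ids]).long().to(device)
    segment_ids = torch.tensor([segment_ids]).long().to(device)
    valid_length = torch.tensor([valid_length])
    model.eval()
    with torch.no_grad():
        out = model(token_ids, valid_length, segment_ids)
    return int(torch.argmax(out, dim=1).item())  # 0 = negative, 1 = positive

print(predict('영화 정말 재미있었어요'))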