4. Text classification
Here we use Word2Vec to generate word embeddings and an LSTM to classify documents from the DBpedia dataset. The Word2Vec model is trained with gensim, and the LSTM classifier is built with PyTorch (via PyTorch Lightning).
[1]:
#!pip install pytorch_lightning mlflow torchtext gensim
[1]:
import pickle
import os
import itertools
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import gensim
import torchtext
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import random_split, TensorDataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
%matplotlib inline
/home/marco/.local/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
4.1. Word2Vec
[2]:
text_corpus = [
"humans can watch TV",
"houses can not watch TV",
"humans can talk about houses",
"refrigerators can not talk",
"humans think about strange things",
"beds do not think about strange things",
"I see interesting things about the next TV",
"Do not think lightly about humans",
"The next major might think about things in a different way about the TV",
"This place is not unsafe for humans",
"Yellow houses are good for humans",
"Those are not safes houses at all",
"Humans have houses",
"Houses are blue",
"Do not think about houses",
"There is one TV inside those houses"
]
class ProcessedText:
def __init__(self, text):
self.text = text
def __iter__(self):
for line in self.text:
yield gensim.utils.simple_preprocess(line)
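The iterable above simply applies gensim.utils.simple_preprocess to each line, which lowercases, tokenizes and drops very short tokens. As a quick illustration of what it yields for the first sentence:
# illustrative check of the preprocessing on the first sentence
print(next(iter(ProcessedText(text_corpus))))
# expected output: ['humans', 'can', 'watch', 'tv']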
[3]:
wv_model = gensim.models.Word2Vec(vector_size=20, window=5, min_count=1, workers=4)
wv_model.build_vocab(ProcessedText(text_corpus), update=False)
wv_model.train(ProcessedText(text_corpus), total_examples=wv_model.corpus_count, epochs=100)
[3]:
(2074, 9400)
[4]:
# example of vector for a word
wv_model.wv['humans']
[4]:
array([-0.05358737, 0.02408984, -0.04244986, -0.02269858, 0.00658333,
-0.00125561, -0.02724232, 0.06634317, 0.00881555, 0.04800051,
-0.03760158, 0.02025081, -0.01671168, -0.00771837, 0.04606754,
-0.02332198, 0.03516515, -0.03691765, -0.03200763, 0.03627926],
dtype=float32)
[5]:
# find most similar words given a set of words
wv_model.wv.most_similar(['humans', 'can'], topn=10)
[5]:
[('not', 0.4467436969280243),
('strange', 0.4427088797092438),
('way', 0.37376558780670166),
('at', 0.3642667531967163),
('the', 0.36400458216667175),
('next', 0.3263437747955322),
('major', 0.22556430101394653),
('think', 0.20717817544937134),
('about', 0.20420345664024353),
('things', 0.177797332406044)]
[6]:
wv_model.wv.most_similar('can', topn=10)
[6]:
[('watch', 0.40134578943252563),
('at', 0.37023916840553284),
('next', 0.3551013469696045),
('things', 0.35425886511802673),
('those', 0.31279364228248596),
('refrigerators', 0.3126063644886017),
('not', 0.2820398807525635),
('major', 0.2547941207885742),
('all', 0.21463869512081146),
('blue', 0.20638896524906158)]
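Besides most_similar, the trained KeyedVectors also expose pairwise cosine similarities and a simple odd-one-out query. A minimal sketch (on such a tiny corpus the exact scores vary between runs):
# cosine similarity between two word vectors
print(wv_model.wv.similarity('humans', 'houses'))
# pick the word least similar to the others
print(wv_model.wv.doesnt_match(['humans', 'houses', 'tv']))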
4.2. Doc2Vec
[7]:
class ProcessedDocs:
def __init__(self, text):
self.text = text
def __iter__(self):
for i, line in enumerate(self.text):
            doc_tags = ['human' if 'human' in line else 'thing']  # note: this substring check is case-sensitive
yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), doc_tags)
[8]:
dv_model = gensim.models.Doc2Vec()
dv_model.build_vocab(ProcessedDocs(text_corpus), update=False)
dv_model.train(ProcessedDocs(text_corpus), total_examples=dv_model.corpus_count, epochs=100)
[9]:
# available dictionary keys for words
dv_model.wv.key_to_index
[9]:
{'about': 0, 'houses': 1, 'not': 2, 'humans': 3, 'think': 4, 'tv': 5}
[10]:
# available dictionary keys for docs
dv_model.dv.key_to_index
[10]:
{'human': 0, 'thing': 1}
[11]:
# example of vector for a word
dv_model.wv['about'].shape, dv_model.wv['about'][:3]
[11]:
((100,), array([-0.001282 , -0.00032439, 0.0041051 ], dtype=float32))
[12]:
# example of vector for a doc
dv_model.dv['human'].shape, dv_model.dv['human'][:3]
[12]:
((100,), array([-0.01216876, -0.01115332, -0.01915727], dtype=float32))
[13]:
# infer similarity of a new document to known document tags
dv_model.dv.similar_by_vector(dv_model.infer_vector(["humans", "can"]))
[13]:
[('human', 0.3137548267841339), ('thing', 0.22879791259765625)]
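Note that infer_vector optimizes a fresh document vector with a few gradient steps, so repeated calls give slightly different results; raising the number of inference epochs usually makes the ranking more stable. A sketch, assuming gensim 4's epochs keyword:
# more inference epochs give a more stable vector for very short documents
vec = dv_model.infer_vector(["humans", "can"], epochs=200)
print(dv_model.dv.similar_by_vector(vec))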
[14]:
# infer similarity of each document in the dataset (as if they were new documents) to known document tags
print(" predict | correct")
for x in ProcessedDocs(text_corpus):
print(" ", x.tags[0], " | ", dv_model.dv.similar_by_vector(dv_model.infer_vector(x.words))[0][0])
 correct | predict
human | thing
thing | thing
human | thing
thing | thing
human | human
thing | thing
thing | human
human | thing
thing | human
human | thing
human | thing
thing | thing
thing | human
thing | thing
thing | thing
thing | thing
[15]:
# infer similarity of each document in a new dataset to known document tags
new_text_corpus = [
"humans can watch TV now",
"You should not think about houses",
"There is a TV inside those houses",
"Good reasons to think",
]
print(" predict | correct")
for x in ProcessedDocs(new_text_corpus):
print(" ", x.tags[0], " | ", dv_model.dv.similar_by_vector(dv_model.infer_vector(x.words))[0][0])
 correct | predict
human | thing
thing | thing
thing | thing
thing | human
4.3. Real data example with Word2Vec and LSTM
[16]:
db = torchtext.datasets.DBpedia(root='.data')
560000lines [00:24, 23132.59lines/s]
560000lines [00:43, 12879.42lines/s]
70000lines [00:04, 14107.03lines/s]
[17]:
# last training example: (class label, tensor of token ids)
db[0][-1]
[17]:
(13,
tensor([ 2, 2282, 6365, 25737, 2282, 6365, 25737, 6, 526,
5, 3115, 21, 2, 2282, 6365, 9, 179, 10,
7, 2785, 7137, 18, 19, 2064, 107, 8, 687,
54, 33, 91, 6, 236, 6129, 3, 7137, 33,
509, 187, 110, 696, 463, 4, 235, 3, 2786,
2, 150, 2282, 6365, 463, 652, 48, 185, 1906,
8, 12314, 524, 3, 1450, 21, 273, 177, 5,
668703, 6505, 8, 32125, 12, 2, 301, 62, 7668,
6934, 526, 792, 6, 30622, 135, 2, 43128, 2974,
23, 347, 1884, 2, 2282, 6365, 3]))
[18]:
class DBProcessor:
def __init__(self, text_train, text_test):
self.text_train = text_train
self.text_test = text_test
def __iter__(self):
for i, instance in enumerate(itertools.chain(self.text_train, self.text_test)):
sentence = instance[1]
yield list(sentence.numpy())
[19]:
vector_size = 100
pdb_train = DBProcessor(db[0], db[1])
dbp_model = gensim.models.Word2Vec(vector_size=vector_size, window=5, min_count=1, workers=4)
dbp_model.build_vocab(pdb_train, update=False)
dbp_model.train(pdb_train, total_examples=dbp_model.corpus_count, epochs=1)
[19]:
(25426807, 34419767)
[20]:
class EmbDatasetTorch(torch.utils.data.Dataset):
def __init__(self, db, wv):
self.db = db
self.wv = wv
def __getitem__(self, i):
instance = self.db[i]
doc_class = instance[0]
sentence = instance[1]
sentence_vecs = [self.wv[v.item()] for v in sentence if v.item() in self.wv.key_to_index.keys()]
return doc_class, torch.as_tensor(np.column_stack(sentence_vecs).T)
def __len__(self):
return len(self.db)
[21]:
db_train = EmbDatasetTorch(db[0], dbp_model.wv)
db_test = EmbDatasetTorch(db[1], dbp_model.wv)
[22]:
next(iter(db_train))
[22]:
(0,
tensor([[-0.6556, -0.8930, 1.3688, ..., -1.6038, -2.4898, 2.5580],
[-0.5155, -2.5213, -1.0673, ..., -1.8014, -4.9374, -1.3045],
[-1.3310, -0.9853, 2.1687, ..., -1.5520, -1.2400, 1.2778],
...,
[ 0.0188, 0.9136, -3.0507, ..., -1.5575, -3.0111, 0.6496],
[ 0.4291, -0.9791, 1.5837, ..., -0.9062, -0.5298, 1.2703],
[-0.5155, -2.5213, -1.0673, ..., -1.8014, -4.9374, -1.3045]]))
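Each item is a (label, tensor) pair where the tensor stacks one embedding per token, so its shape is (sentence_length, vector_size). A quick check, for illustration:
# label is an integer class id, vecs has shape (number_of_tokens, 100)
label, vecs = db_train[0]
print(label, vecs.shape)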
Let’s define our PyTorch Lightning data module.
[38]:
def collate_fn(x):
labels, sentences = zip(*x)
sentences = nn.utils.rnn.pack_sequence(sentences, enforce_sorted=False)
labels = torch.LongTensor(labels)
return sentences, labels
class DataModule(pl.LightningDataModule):
def __init__(self, db_train, db_test, batch_size = 50,
num_workers=2, train_val_split_seed=0):
super().__init__()
self.batch_size = min(batch_size, len(db_train))
self.num_workers = num_workers
self.train_val_split_seed = train_val_split_seed
self.db_train = db_train
self.db_test = db_test
def setup(self, stage):
if stage == 'fit':
full_dataset = self.db_train
generator = torch.Generator().manual_seed(self.train_val_split_seed)
full_size = len(full_dataset)
val_size = min(full_size//10, 10000)
partitions = [full_size - val_size, val_size]
full_dataset = torch.utils.data.random_split(full_dataset, partitions,
generator=generator)
self.train_dataset, self.val_dataset = full_dataset
if stage == 'test':
if self.db_test is not None:
self.test_dataset = self.db_test
def train_dataloader(self):
return DataLoader(self.train_dataset, batch_size=self.batch_size, drop_last=True,
shuffle=True, num_workers=self.num_workers, collate_fn=collate_fn)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size=self.batch_size,
num_workers = self.num_workers, collate_fn=collate_fn)
def test_dataloader(self):
if self.db_test is None:
raise RuntimeError("Test data not set")
return DataLoader(self.test_dataset, batch_size=self.batch_size,
num_workers = self.num_workers, collate_fn=collate_fn)
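Because the sentences have different lengths, collate_fn packs them with pack_sequence instead of padding, and the LSTM later consumes the PackedSequence directly. A minimal sketch of what packing does, using two made-up sequences:
# two 'sentences' of different lengths, one embedding per row
a = torch.randn(3, vector_size)
b = torch.randn(5, vector_size)
packed = nn.utils.rnn.pack_sequence([a, b], enforce_sorted=False)
# data concatenates all 8 time steps; batch_sizes tells the LSTM how many
# sequences are still active at each time step
print(packed.data.shape, packed.batch_sizes)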
[39]:
datamodule = DataModule(db_train, db_test, num_workers=0)
datamodule.setup('fit')
[40]:
sentences, labels = next(iter(datamodule.train_dataloader()))
4.4. Defining main classes
[31]:
class LitNN(pl.LightningModule):
def __init__(self, vector_size, n_classification_labels,
lstm_hidden_size = 150,
lstm_num_layers = 2,
lr=0.01, weight_decay=0):
super().__init__()
assert n_classification_labels != 1
self.lr = lr
self.weight_decay = weight_decay
self.n_classification_labels = n_classification_labels
input_size = vector_size
self.lstm = nn.LSTM(
input_size = vector_size,
hidden_size = lstm_hidden_size,
num_layers = lstm_num_layers,
)
self.last_layer = self._initialize_layer(nn.Linear(lstm_hidden_size, n_classification_labels))
def forward(self, x):
x = self.lstm(x)
        x = x[1][0][-1]  # final hidden state of the last LSTM layer: (batch, hidden_size)
x = self.last_layer(x)
return x
def _initialize_layer(self, layer):
nn.init.constant_(layer.bias, 0)
gain = nn.init.calculate_gain('relu')
nn.init.xavier_normal_(layer.weight, gain=gain)
return layer
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
return optimizer
def training_step(self, train_batch, batch_idx):
inputv, target = train_batch
output = self.forward(inputv)
if self.n_classification_labels:
loss = F.cross_entropy(output, target)
self.log('train_loss_ce', loss.item())
else:
loss = F.mse_loss(output, target)
self.log('train_loss_rmse', np.sqrt(loss.item()))
return loss
def test_validation_step(self, batch, batch_idx, name):
inputv, target = batch
output = self.forward(inputv)
if self.n_classification_labels:
loss_ce = F.cross_entropy(output, target).item()
loss_zo = (torch.argmax(output, 1) != target)+0.
loss_zo = loss_zo.mean().item()
self.log(f'{name}_loss_ce', loss_ce)
self.log(f'{name}_loss_zo', loss_zo)
else:
loss_mse = F.mse_loss(output, target).item()
loss_mae = F.l1_loss(output, target).item()
self.log(f'{name}_loss_rmse', np.sqrt(loss_mse))
self.log(f'{name}_loss_mae', loss_mae)
def validation_step(self, val_batch, batch_idx):
self.test_validation_step(val_batch, batch_idx, 'val')
def test_step(self, test_batch, batch_idx):
self.test_validation_step(test_batch, batch_idx, 'test')
def predict_step(self, predict_batch, batch_idx, dataloader_idx):
inputv, target = predict_batch
output = self.forward(inputv)
return output
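A quick sanity check on the batch drawn earlier from the DataModule, for illustration; the weights are untrained, so the logits themselves are meaningless:
# feed the packed batch through an untrained model: logits of shape (batch_size, 14)
tmp_model = LitNN(vector_size=vector_size, n_classification_labels=14)
with torch.no_grad():
    logits = tmp_model(sentences)
print(logits.shape)  # expected: torch.Size([50, 14])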
[42]:
datamodule = DataModule(db_train, db_test, batch_size=2048)
smodel = LitNN(vector_size=vector_size, n_classification_labels=14)
early_stop_callback = EarlyStopping(
monitor='val_loss_ce',
min_delta=0.00,
patience=10,
verbose=False,
mode='min'
)
# use MLFlow as logger if available, see other options at
# https://pytorch-lightning.readthedocs.io/en/latest/common/loggers.html
# you can start MLFLow server with:
# mlflow server --backend-store-uri=./mlruns
try:
from pytorch_lightning.loggers import MLFlowLogger
logger = MLFlowLogger(
experiment_name="Default",
tracking_uri="file:./mlruns"
)
except ImportError:
# default: Tensorboard, you can start with:
# tensorboard --logdir lightning_logs
logger = True
trainer = pl.Trainer(
precision=32,
gpus=torch.cuda.device_count(),
#tpu_cores=None,
logger=logger,
val_check_interval=0.1, # do validation check 10 times for each epoch
#auto_scale_batch_size=True,
auto_lr_find=True,
callbacks=early_stop_callback,
max_epochs = 100,
)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
[43]:
trainer.tune(smodel, datamodule = datamodule)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
--------------------------------------
0 | lstm | LSTM | 332 K
1 | last_layer | Linear | 2.1 K
--------------------------------------
334 K Trainable params
0 Non-trainable params
334 K Total params
1.338 Total estimated model params size (MB)
/home/marco/.local/lib/python3.8/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: The dataloader, val dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
warnings.warn(*args, **kwargs)
/home/marco/.local/lib/python3.8/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: The dataloader, train dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
warnings.warn(*args, **kwargs)
--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{}
--------------------------------------------------------------------------------
Restored states from the checkpoint file at /home/marco/Documents/projects/python-intro/sections/lr_find_temp_model.ckpt
Learning rate set to 0.19054607179632482
[43]:
{'lr_find': <pytorch_lightning.tuner.lr_finder._LRFinder at 0x7f1bfae19400>}
[44]:
# fit smodel
trainer.fit(smodel, datamodule = datamodule)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
--------------------------------------
0 | lstm | LSTM | 332 K
1 | last_layer | Linear | 2.1 K
--------------------------------------
334 K Trainable params
0 Non-trainable params
334 K Total params
1.338 Total estimated model params size (MB)
[45]:
trainer.test(smodel, datamodule = datamodule)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/marco/.local/lib/python3.8/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: The dataloader, test dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
warnings.warn(*args, **kwargs)
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss_ce': 0.5150100588798523, 'test_loss_zo': 0.1482437252998352}
--------------------------------------------------------------------------------
[45]:
[{'test_loss_ce': 0.5150100588798523, 'test_loss_zo': 0.1482437252998352}]
[46]:
# predict with smodel on the first 10 training examples
data_subset = torch.utils.data.Subset(db_train, range(10))
data_loader = DataLoader(data_subset, collate_fn=collate_fn)
test_pred = trainer.predict(smodel, data_loader)
test_pred = [F.softmax(t, 1).cpu() for t in test_pred]
test_pred = np.vstack(test_pred)
test_pred
/home/marco/.local/lib/python3.8/site-packages/pytorch_lightning/utilities/distributed.py:69: DeprecationWarning: The `LightningModule.datamodule` property is deprecated in v1.3 and will be removed in v1.5. Access the datamodule through using `self.trainer.datamodule` instead.
warnings.warn(*args, **kwargs)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/marco/.local/lib/python3.8/site-packages/pytorch_lightning/utilities/distributed.py:69: UserWarning: The dataloader, predict dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
warnings.warn(*args, **kwargs)
[46]:
array([[4.09552157e-01, 1.06071964e-01, 8.41988176e-02, 2.59270594e-02,
3.22150514e-02, 5.33441489e-04, 5.33908457e-02, 3.97227984e-03,
1.80906039e-02, 3.13700511e-05, 7.64723518e-04, 9.98726246e-05,
4.46949855e-07, 2.65151352e-01],
[9.75617692e-02, 1.64559879e-07, 4.24981059e-04, 7.47480681e-06,
6.20739684e-06, 5.22396527e-03, 1.70907925e-03, 2.12730301e-04,
1.85641170e-06, 1.91530702e-03, 5.31282611e-02, 1.37102649e-01,
3.89551185e-02, 6.63750350e-01],
[8.68686259e-01, 1.33589652e-04, 1.16329047e-05, 1.16774821e-08,
1.95751545e-06, 2.35673203e-03, 5.91394305e-02, 3.88082638e-02,
3.63222134e-05, 7.82081827e-07, 1.42352932e-04, 2.43386975e-03,
1.07618331e-10, 2.82487869e-02],
[7.53526568e-01, 2.33273354e-07, 2.88229767e-05, 5.08333073e-07,
4.38038005e-05, 4.07082103e-02, 4.40885797e-02, 9.62672383e-02,
1.27165418e-04, 1.53450109e-03, 1.51878968e-02, 4.36797366e-03,
7.46173373e-06, 4.41109948e-02],
[2.87303030e-01, 1.16851628e-02, 1.08578301e-04, 3.28232352e-09,
6.70920999e-05, 6.95538940e-03, 6.83973789e-01, 5.42083988e-04,
3.53601922e-06, 3.26784333e-09, 5.43062072e-07, 7.72331332e-05,
3.48732050e-11, 9.28363111e-03],
[6.40867278e-02, 1.21500306e-01, 9.95871029e-04, 5.26127842e-05,
3.09540446e-05, 3.12927623e-05, 4.73763747e-03, 5.42093294e-05,
1.59608258e-04, 2.08304823e-06, 3.71216920e-05, 4.27920138e-04,
5.87089471e-06, 8.07877779e-01],
[4.58942205e-01, 5.02900593e-06, 4.93392348e-04, 3.42837564e-04,
2.99833203e-03, 2.20704935e-02, 4.25901487e-02, 2.71155648e-02,
4.55437973e-03, 6.51715994e-02, 3.30148749e-02, 7.66062527e-04,
3.78116369e-02, 3.04123491e-01],
[3.61741275e-01, 2.80804306e-05, 2.67065698e-05, 3.83088200e-10,
2.48303586e-05, 5.98860420e-02, 5.72525144e-01, 2.20764731e-03,
6.49348181e-07, 2.63451909e-08, 5.55272118e-06, 3.56301782e-04,
9.63145605e-11, 3.19772563e-03],
[8.64959776e-01, 1.35151431e-05, 5.78636946e-06, 3.87738730e-09,
1.05855952e-06, 4.55121743e-03, 5.02642728e-02, 5.95776401e-02,
1.58676849e-05, 1.23718848e-06, 2.75576807e-04, 3.97440372e-03,
9.71813185e-11, 1.63596980e-02],
[2.42577583e-01, 4.70198188e-07, 9.49133828e-05, 3.02213117e-08,
6.46146887e-04, 3.61857921e-01, 3.85332048e-01, 2.62155617e-03,
3.56351916e-06, 1.71235315e-05, 1.89578263e-04, 3.00951971e-04,
3.18828302e-06, 6.35486608e-03]], dtype=float32)
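Each row of test_pred is a probability vector over the 14 classes, so the predicted label is simply the arg-max; comparing it with the true labels of those first 10 training documents, for illustration:
# predicted class per document vs. the stored labels
pred_labels = test_pred.argmax(axis=1)
true_labels = [db_train[i][0] for i in range(10)]
print(pred_labels, true_labels)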
[47]:
smodel.trainer.callback_metrics
[47]:
{'val_loss_ce': tensor(0.4905),
'val_loss_zo': tensor(0.1475),
'train_loss_ce': tensor(0.5414, device='cuda:0'),
'test_loss_ce': tensor(0.5150),
'test_loss_zo': tensor(0.1482)}
[48]:
# check that smodel is picklable
pickle.dumps(smodel);
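Since the model pickles cleanly, it can also be persisted with Lightning's checkpoint machinery. A minimal sketch, with a hypothetical file name; because LitNN does not call save_hyperparameters, the constructor arguments have to be passed again at load time:
# save the trained weights and restore them into a fresh LitNN instance
trainer.save_checkpoint("lstm_dbpedia.ckpt")
restored = LitNN.load_from_checkpoint(
    "lstm_dbpedia.ckpt",
    vector_size=vector_size,
    n_classification_labels=14,
)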