This blog post compares two methods for generating vectorisations of text, by training a classification model to predict who said a given sentence in a transcript.
For this project I use the output of Amazon's Transcribe service after we sent it recordings of the Journal Club podcast. The corpus is made up of 7 different transcriptions, which are split into 'documents': individual sentences, each spoken by Kyle, Lan or myself.
Corpus : the combined documents from the transcripts
Document : a sentence uttered by one speaker, composed of terms
Term : an individual word or token
import pandas as pd
urls = ["csvs/adversarial-examples-protein-folding-and-shaley-values.csv",
"csvs/dark-secrets-of-bert-radioactive-data-and-vanishing-gradients.csv",
"csvs/ep1_raw.csv",
"csvs/intro-narration-raw.csv",
"csvs/JC-EP-12.csv",
"csvs/JC-Ep2-DS-Page.csv"]
data = pd.read_csv(urls[0])
for url in urls[1:]:
data_m = pd.read_csv(url)
data = pd.concat([data, data_m])
data.head()
TF-IDF stands for term frequency-inverse document frequency. It allows us to give each term in a document a score that reflects its importance, taking into account how frequently it appears in the rest of the corpus. It is widely used in text-based recommender systems. A high TF-IDF score means that a term appears often in the document while being rare across the corpus as a whole.
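To make that concrete, here is a toy calculation of the weighting with made-up counts. gensim's TfidfModel defaults are roughly raw term frequency times a log-scaled inverse document frequency (with the document vectors then length-normalised), so the exact values it produces will differ slightly from this sketch.
import math
# Toy tf-idf weight (hypothetical counts, not taken from our corpus)
N = 1000   # total documents in the corpus
tf = 4     # occurrences of the term in this document
df = 20    # documents in the corpus that contain the term
weight = tf * math.log2(N / df)
print(weight)   # ~22.6: frequent in the document, rare in the corpus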
We're using the gensim library to generate our model here.
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
import numpy as np
from collections import defaultdict
corpus = data.loc[:,'0']
# process each sentence into tokens
texts = [[word for word in simple_preprocess(document)]
         for document in corpus]
print(texts[0])
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
print(len(processed_corpus))
# Generate integer representations of each word
dictionary = corpora.Dictionary(processed_corpus)
# Generate Bag-Of-Words representation of each document
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
# Train our tfidf model on the corpus
tfidf = models.TfidfModel(bow_corpus)
# Example sentence
test_sentence = "Hi, my name is George"
print("Example String: \n '{}'\n".format(test_sentence))
query_document = simple_preprocess(test_sentence)
print("Preprocess to tokens: \n {}\n".format(query_document))
query_bow = dictionary.doc2bow(query_document)
print("Turn into Bag-of-Words representation: \n {}\n".format(query_bow))
vectorized = tfidf[query_bow]
print("Model output! Sparse tf-idf representation: \n {}".format(vectorized))
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary.dfs))
sims = index[vectorized]
print("==TESTING DOCUMENT SIMILARITY==")
print("Test sequence: '{}'\n".format(test_sentence))
count = 0
print("RANK | Speaker: sentence\n")
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print("{} | {}: '{}'".format(count, data.loc[:,'1'].iloc[document_number], corpus.iloc[document_number]))
    print(" score = {:.3f}\n".format(score))
    count += 1
    if count == 10: break
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import Sequence, to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
# simple dense classifier on top of the 2,084-dimensional tf-idf vectors
tfidf_model = Sequential(
    [
        Input(shape=(2084,)),
        Dense(200, activation="relu", name="layer1"),
        Dense(3, name="output", activation="softmax"),
    ]
)
tfidf_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
class tfidf_generator(Sequence):
    def __init__(self, vectorizer, dictionary, documents, labels, shape=2084):
        self.vectorizer = vectorizer
        self.documents = np.asarray(documents)
        self.labels = np.asarray(labels)
        self.name_dict = {"Kyle": 0, "Lan": 1, "George": 2}
        self.shape = shape
        self.dictionary = dictionary
        self.on_epoch_end()

    def __len__(self):
        # one document per batch
        return int(np.floor(len(self.documents)))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Look up the shuffled index for this batch
        i = self.indexes[index]
        # Find the document and its label
        doc = self.documents[i]
        label = self.labels[i]
        # Generate data: a dense tf-idf vector and a one-hot speaker label
        X = np.zeros((1, self.shape))
        y = np.zeros((1), dtype=int)
        query_doc = simple_preprocess(doc)
        query_bow = self.dictionary.doc2bow(query_doc)
        sparse_rep = self.vectorizer[query_bow]
        for j, s in sparse_rep:
            X[0, j] = s
        y[0] = self.name_dict[label]
        return X, to_categorical(y, num_classes=3)

    def get_y_true(self):
        y = [self.name_dict[self.labels[i]] for i in self.indexes]
        return y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.documents))
        np.random.shuffle(self.indexes)
def cm(y_true, y_pred, names=None):
    '''
    Prints a confusion matrix heatmap from input y_true, y_pred arrays
    y axis: true
    x axis: prediction
    '''
    if names is None: names = ["Kyle", "Lan", "George"]
    print("Test accuracy: {:.2f}".format(accuracy_score(y_true, y_pred)))
    cm = confusion_matrix(y_true, y_pred, normalize='true')
    sn.heatmap(cm, square=True, xticklabels=names, yticklabels=names, annot=True)
    plt.ylabel("True")
    plt.xlabel("Predicted")
    plt.show()
X_train, X_test, y_train, y_test = train_test_split(data.loc[:,'0'],
                                                     data.loc[:,'1'],
                                                     test_size=0.20, random_state=0)
# set up data generators
train_generator = tfidf_generator(tfidf, dictionary, X_train, y_train)
test_generator = tfidf_generator(tfidf, dictionary, X_test, y_test)
# train the model
tfidf_model.fit(train_generator)
# evaluate the model
y_pred = tfidf_model.predict(test_generator)
cm(test_generator.get_y_true(), np.argmax(y_pred, axis=1))
The result here is OK: at ~65% accuracy it's well above the 1/3 random-chance baseline, roughly halfway to perfect. The confusion matrix shows it favouring Kyle, though, so let's try balancing the data.
data['1'].value_counts()
def balanced_sample_maker(X, y, sample_size, random_seed=None):
    """ return a balanced data set by sampling all classes with sample_size
        current version is developed on assumption that the positive
        class is the minority.
        Author: Kevin Mader on StackOverflow
        Parameters:
        ===========
        X: {numpy.ndarray}
        y: {numpy.ndarray}
    """
    uniq_levels = np.unique(y)
    uniq_counts = {level: sum(y == level) for level in uniq_levels}
    if random_seed is not None:
        np.random.seed(random_seed)
    # find observation indices of each class level
    groupby_levels = {}
    for ii, level in enumerate(uniq_levels):
        obs_idx = [idx for idx, val in enumerate(y) if val == level]
        groupby_levels[level] = obs_idx
    # oversample observations of each label
    balanced_copy_idx = []
    for gb_level, gb_idx in groupby_levels.items():
        over_sample_idx = np.random.choice(gb_idx, size=sample_size, replace=True).tolist()
        balanced_copy_idx += over_sample_idx
    np.random.shuffle(balanced_copy_idx)
    return X.iloc[balanced_copy_idx], y.iloc[balanced_copy_idx], balanced_copy_idx
balanced_tfidf_model = Sequential(
    [
        Input(shape=(2084,)),
        Dense(200, activation="relu", name="layer1"),
        Dense(3, name="output", activation="softmax"),
    ]
)
balanced_tfidf_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# resample so that each speaker contributes 261 documents
X, y, _ = balanced_sample_maker(data.loc[:,'0'], data.loc[:,'1'], 261)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
# set up data generators
train_generator = tfidf_generator(tfidf, dictionary, X_train, y_train)
test_generator = tfidf_generator(tfidf, dictionary, X_test, y_test)
# train the model
balanced_tfidf_model.fit(train_generator, epochs=2)
# evaluate the model
y_pred = balanced_tfidf_model.predict(test_generator)
cm(test_generator.get_y_true(), np.argmax(y_pred, axis=1))
That's an improvement! Far fewer false positives.
Now let's try something more sophisticated:
bert_urls = ["bert vectors/adversarial-examples-protein-folding-and-shaley-values_bert.csv",
"bert vectors/dark-secrets-of-bert-radioactive-data-and-vanishing-gradients_bert.csv",
"bert vectors/ep1_bert.csv",
"bert vectors/intro-narration-raw_bert.csv",
"bert vectors/JC-EP-12_bert.csv",
"bert vectors/JC-Ep2-DS-Page_bert.csv"]
data = pd.read_csv(bert_urls[0])
for url in bert_urls[1:]:
data_m = pd.read_csv(url)
data = pd.concat([data, data_m])
Using the BERT language model, I generated a 768-dimensional vector representation for every document in the corpus ahead of time; the CSVs above hold these vectors alongside the speaker labels.
data.head()
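The pipeline that produced those CSVs isn't shown in this post, but for illustration here is one way you could generate comparable 768-dimensional sentence vectors with the Hugging Face transformers library, mean-pooling BERT's final hidden states. The choice of bert-base-uncased, the pooling strategy and the transcript_sentences variable are assumptions for this sketch, not necessarily what was used to build the files above.
import torch
from transformers import BertTokenizer, BertModel
# Hypothetical sketch: one 768-dim mean-pooled BERT vector per sentence.
# Model name and pooling choice are assumptions, not the pipeline behind the CSVs above.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")
bert.eval()
def embed(sentence):
    # tokenise the sentence, run it through BERT, and average the token embeddings
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        output = bert(**inputs)
    return output.last_hidden_state.mean(dim=1).squeeze(0).numpy()
print(embed("Hi, my name is George").shape)   # (768,)
# applied to every sentence (hypothetical variable name):
# vectors = [embed(s) for s in transcript_sentences]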
bert_model = Sequential(
    [
        Input(shape=(768,)),
        Dense(200, name="layer1", activation="relu"),
        Dense(3, name="output", activation="softmax"),
    ]
)
bert_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
tidy_data = pd.get_dummies(data)
X_train, X_test, y_train, y_test = train_test_split(tidy_data.loc[:,'0':'767'],
                                                     tidy_data.loc[:,'label_George':'label_Lan'],
                                                     test_size=0.10, random_state=0,
                                                     stratify=tidy_data.loc[:,'label_George':'label_Lan'])
history = bert_model.fit(X_train, y_train, epochs=25, verbose=0)
plt.plot(history.history['loss'])
plt.title("Loss")
plt.show()
y_pred = bert_model.predict(X_test)
cm(np.argmax(np.asarray(y_test), axis=1), np.argmax(y_pred, axis=1), names=["George", "Kyle", "Lan"])
balanced_bert_model = Sequential(
    [
        Input(shape=(768,)),
        Dense(200, name="layer1", activation="relu"),
        Dense(3, name="output", activation="softmax"),
    ]
)
balanced_bert_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
X, y, _ = balanced_sample_maker(data.loc[:,'0':'767'],
                                data.loc[:,'label'], 261)
X_train, X_test, y_train, y_test = train_test_split(X, pd.get_dummies(y),
                                                    test_size=0.10, random_state=0, stratify=y)
history = balanced_bert_model.fit(X_train, y_train, epochs=25, verbose=0)
plt.plot(history.history['loss'])
plt.title("Loss")
plt.show()
y_pred = balanced_bert_model.predict(X_test)
cm(np.argmax(np.asarray(y_test), axis=1), np.argmax(y_pred, axis=1), names=["George", "Kyle", "Lan"])
url = "csvs/tools-for-misusing-gpt-2-tensorflow-and-ml-unfairness.csv"
bert_url = "bert vectors/tools-for-misusing-gpt-2-tensorflow-and-ml-unfairness_bert.csv"
tfidf_data = pd.read_csv(url)
# set up data generator
target_generator = tfidf_generator(tfidf, dictionary, tfidf_data.loc[:,'0'], tfidf_data.loc[:,'1'])
# evaluate the tfidf model
print("UNBALANCED TF-IDF MODEL")
y_pred = tfidf_model.predict(target_generator)
cm(target_generator.get_y_true(), np.argmax(y_pred, axis=1))
# evaluate the balanced tfidf model
print("BALANCED TF-IDF MODEL")
y_pred = balanced_tfidf_model.predict(target_generator)
cm(target_generator.get_y_true(), np.argmax(y_pred, axis=1))
bert_data = pd.read_csv(bert_url)
tidy_bert_data = pd.get_dummies(bert_data)
# evaluate the bert model
print("UNBALANCED BERT MODEL")
y_pred = bert_model.predict(tidy_bert_data.loc[:,'0':'767'])
cm(np.argmax(np.asarray(tidy_bert_data.loc[:,'label_George':'label_Lan']), axis=1),
   np.argmax(y_pred, axis=1), names=["George", "Kyle", "Lan"])
# evaluate the balanced bert model
print("BALANCED BERT MODEL")
y_pred = balanced_bert_model.predict(tidy_bert_data.loc[:,'0':'767'])
cm(np.argmax(np.asarray(tidy_bert_data.loc[:,'label_George':'label_Lan']), axis=1),
   np.argmax(y_pred, axis=1), names=["George", "Kyle", "Lan"])