# Work in progress for a blog post: pull Wikipedia pages about trees and about
# birds, train a random forest classifier on them, and see whether LIME can
# explain the model's predictions.
from urllib.request import urlopen
from urllib.parse import urlencode
from json import loads
import os
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import random
import lime
import numpy as np
import sklearn
import sklearn.feature_extraction as fe
import sklearn.ensemble
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
# Local cache directory for downloaded article pages.
# exist_ok=True: os.mkdir raised FileExistsError on every rerun of the script.
os.makedirs('tmp', exist_ok=True)
# Seed list pages: each links out to many individual bird/tree articles.
birds_raw = requests.get('https://en.wikipedia.org/wiki/List_of_birds_of_the_United_States').content.decode('utf-8')
trees_raw = requests.get('https://en.wikipedia.org/wiki/List_of_U.S._state_and_territory_trees').content.decode('utf-8')
def build_corpus(content):
    """Download every article linked from a Wikipedia list page.

    content: UTF-8 HTML of the list page.
    Returns a list of local file paths under tmp/, one per linked article.
    Pages already present on disk are not re-downloaded (simple cache),
    but are still included in the returned list.
    """
    pages = []
    soup = BeautifulSoup(content, 'html.parser')
    for link in soup.find_all('a'):
        href = link.get('href')
        # Keep only ordinary same-wiki article links: skip images,
        # namespaced pages (File:, Category:, ...), and other list pages.
        if (href is None
                or not href.startswith('/wiki/')
                or href.endswith('.jpg')
                or ':' in href
                or href.startswith('/wiki/List_')):
            continue
        fn = 'tmp/' + href[6:] + '.html'
        if not os.path.exists(fn):
            print(fn)
            # Use a distinct name: the original reassigned the `content`
            # parameter here, shadowing the list page's HTML.
            page_html = requests.get(f'https://en.wikipedia.org{href}').content.decode('utf-8')
            with open(fn, 'w', encoding='utf-8') as f:
                f.write(page_html)
        pages.append(fn)
    return pages
# Build the two corpora: one cached HTML file per article linked from each
# list page. These path lists drive the labelled-document loop below.
birds = build_corpus(birds_raw)
trees = build_corpus(trees_raw)
from html.parser import HTMLParser
class MLStripper(HTMLParser):
    """HTMLParser subclass that collects only text content, discarding tags.

    Feed it HTML with feed(); retrieve the concatenated text with get_data().
    """

    def __init__(self):
        # Initialize the base parser properly (the original skipped this and
        # relied on reset() happening to set up the parser state).
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True  # decode &amp; etc. straight into text
        self.fed = []  # accumulated text fragments, joined in get_data()

    def handle_data(self, d):
        # Called by the parser for every run of text between tags.
        self.fed.append(d)

    def get_data(self):
        """Return all text seen so far as one string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the plain text of *html* with all markup removed."""
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()
docs = []
# Read each cached page back, strip the markup, and tag it with its corpus
# label. The two original copy-pasted loops are folded into one.
for label, filenames in (('birds', birds), ('trees', trees)):
    for fn in filenames:
        with open(fn, 'r', encoding='utf-8') as f:
            text = strip_tags(f.read())
        docs.append({"label": label, "doc": text})
len(docs)  # notebook artifact: displays the corpus size
df = pd.DataFrame(docs)
# Shuffle so the positional train/test split below is random w.r.t. class.
df = df.sample(frac=1).reset_index(drop=True)
df.head()
# Map label strings to integer ids. Sort first: iterating a bare set gives a
# nondeterministic order across runs (string hash randomization), which would
# silently swap which class is 0 and which is 1 from run to run.
labels = sorted(set(df['label']))
label_map = dict(zip(labels, np.arange(len(labels))))
df['label_id'] = df['label'].map(label_map)
train_proportion = 0.8
p = int(df.shape[0] * train_proportion)  # row index of the train/test split
# TF-IDF features; lowercase=False keeps case distinctions (proper names).
vectorizer = fe.text.TfidfVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(df[0:p]['doc'])
test_vectors = vectorizer.transform(df[p:]['doc'])
# Train a 500-tree random forest on the TF-IDF vectors.
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500)
rf.fit(train_vectors, df[0:p]['label_id'].tolist())
# (The RandomForestClassifier(...) repr that the notebook echoed after fit()
# had been pasted into the script as code, constructing and discarding a
# second unused classifier; it has been removed.)
pred = rf.predict(test_vectors)
# Binary F1 on the held-out 20% — two classes, so average='binary' applies.
sklearn.metrics.f1_score(df[p:]['label_id'].tolist(), pred, average='binary')
from lime import lime_text
from sklearn.pipeline import make_pipeline

# Chain vectorizer + forest so LIME (and we) can go straight from raw text
# to class probabilities.
c = make_pipeline(vectorizer, rf)
predictions = c.predict_proba(df[p:]['doc'])
# Vectorized replacement for the original append loop: row_number is the
# document's index in df, prediction is P(class 0) for that document.
df2 = pd.DataFrame({
    'row_number': df.index[p:],
    'prediction': predictions[:, 0],
})
# 'surprise' peaks at 1.0 when the model is maximally unsure (p == 0.5).
df2['surprise'] = 1 - (0.5 - df2['prediction']).abs()
df2.sort_values('surprise', inplace=True, ascending=False)
df2.index = np.arange(0, df2.shape[0])
_ = df2['surprise'].plot()
df2.head()  # notebook artifact: the most uncertain predictions
from lime.lime_text import LimeTextExplainer

# LimeTextExplainer expects class_names as a sequence indexed by class id.
# Invert label_map (name -> id) into a list ordered by id, instead of the
# original's dict keyed by id.
id_to_name = {v: k for k, v in label_map.items()}
class_names = [id_to_name[i] for i in range(len(id_to_name))]
explainer = LimeTextExplainer(class_names=class_names)
# Explain the single most "surprising" document found above.
idx = 1642
exp = explainer.explain_instance(df.iloc[idx]['doc'], c.predict_proba, num_features=6)
print('Document id: %d' % idx)
proba = c.predict_proba([df.iloc[idx]['doc']])
# The original printed this value twice; once is enough.
print('Probability =', proba)
print('True class: %s' % df.iloc[idx]['label'])
exp.as_list()
fig = exp.as_pyplot_figure()
# NOTE(review): the original reassigned exp.class_names to label_map
# (name -> id); LIME indexes class_names by integer class id, so that
# inverted mapping would mislabel the rendered output. Leaving the
# explainer's own class-name mapping in place — confirm rendering looks right.
exp.show_in_notebook(text=True)
df2.tail()  # least surprising documents (most confident predictions)
# Explain one of the model's most confident documents for contrast.
idx = 1740
exp = explainer.explain_instance(df.iloc[idx]['doc'], c.predict_proba, num_features=6)
print('Document id: %d' % idx)
proba = c.predict_proba([df.iloc[idx]['doc']])
# The original printed this value twice; once is enough.
print('Probability =', proba)
print('True class: %s' % df.iloc[idx]['label'])
exp.as_list()
fig = exp.as_pyplot_figure()
# NOTE(review): as with the first explanation, the original's
# exp.class_names = label_map assignment (name -> id, the wrong direction
# for LIME's id-indexed lookup) is dropped — confirm rendering looks right.
exp.show_in_notebook(text=True)