
The Guardian News Classification Using Naive Bayes Classifier

In this project, a Naive Bayes classifier is used to classify Guardian articles. Each article carries one of six labels: business, culture, environment, life-and-style, sport, or technology. We estimate word probabilities from a training split and then apply the classifier to a held-out test sample.

In [1]:
#!pip install wordcloud
In [3]:
import requests
import json
import time
import os
import re
import random as rd

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from wordcloud import WordCloud
from sklearn.metrics import classification_report, confusion_matrix
In [4]:
url = "https://content.guardianapis.com/search?section=%s&show-fields=body&page=%s&page-size=200&api-key="
In [5]:
key = "" #guardian api key
url = url + key
types=["technology","lifeandstyle","business","sport","culture","environment"]
filename = "news_database.csv"
totalpage = 10 #1Page contains 200 articles
#scrap(type_)
In [6]:
def get_articles(types_):
    # Reuse the cached CSV if present; otherwise download the articles via the API.
    if filename in os.listdir("."):
        df_ = pd.read_csv(filename)
        print("file found")
    else:
        print("creating article dataframe...")
        textdata = []
        label = []
        for typ in types_:
            for k in range(1, totalpage + 1):
                u1 = url % (typ, str(k))
                data = requests.get(u1)
                parsed_data = json.loads(data.text)
                if parsed_data["response"]["status"] == "ok":
                    for art in parsed_data["response"]["results"]:
                        textdata.append(art["fields"]["body"])
                        label.append(typ)
                print(len(label))  # running article count
                time.sleep(2)  # rate-limit courtesy pause
        df_ = pd.DataFrame({'Text': textdata, 'Label': label})
        df_.to_csv(filename, encoding='utf-8')
    return df_
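For reference, the parser above relies on a response shape roughly like this (an abridged sketch inferred from the field accesses in the code, not copied from the Guardian API documentation):

parsed_data = {
    "response": {
        "status": "ok",
        "results": [
            {"fields": {"body": "<p>Article body as HTML ...</p>"}},
            # ... up to page-size (200) entries per page
        ],
    }
}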
In [7]:
df =  get_articles(types)
file found
In [8]:
# Cleaning section
df = df.replace("(<figure.*?</figure>)", ' ', regex=True)  # drop embedded figures
df = df.replace("(<a\sh.*?</a>)", ' ', regex=True)         # drop hyperlinks
df = df.replace('<[^>]*>', ' ', regex=True)                # strip remaining HTML tags
df = df.replace('â€.', ' ', regex=True)                    # strip mojibake punctuation
df = df.replace("[^a-zA-Z]", " ", regex=True)              # keep letters only
# Remove words of three letters or fewer; see the worked example below for
# why several passes are needed.
for i in range(5):
    df = df.replace("\s(\w{1,3})\s", ' ', regex=True)
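To see why the short-word filter runs five times, consider a small example (a hypothetical string, not from the dataset). Each regex match consumes the trailing space that the next short word would need, so a single pass removes only every other short word:

import re

s = " a an the cat "
s = re.sub(r"\s(\w{1,3})\s", " ", s)  # ' an cat '  ("a" and "the" removed)
s = re.sub(r"\s(\w{1,3})\s", " ", s)  # ' cat '
s = re.sub(r"\s(\w{1,3})\s", " ", s)  # ' '

Three passes suffice here; the notebook uses five as a safe margin for longer runs of short words.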
In [9]:
df.Label = pd.Categorical(df.Label)
# Category codes are assigned alphabetically:
# business=0, culture=1, environment=2, lifeandstyle=3, sport=4, technology=5
df['Category'] = df.Label.cat.codes
df.Label.value_counts()
Out[9]:
technology      2000
sport           2000
lifeandstyle    2000
environment     2000
culture         2000
business        2000
Name: Label, dtype: int64
In [10]:
# Cache the cleaned dataframe to a separate CSV.
if 'filename.csv' not in os.listdir("."):
    df.to_csv("filename.csv", encoding='utf-8')
df.head()
Out[10]:
Unnamed: 0 Text Label Category
0 0 World Bank reassuring message those fearful b... technology 5
1 1 Ryker Price Looking like Trans... technology 5
2 2 When story told historians hard pressed... technology 5
3 3 This March Facebook coming under global scru... technology 5
4 4 Facebook accounts were affected data brea... technology 5
In [11]:
df['Text']=df['Text'].str.lower()
df['words']=df['Text'].str.split()
df.head()
Out[11]:
Unnamed: 0 Text Label Category words
0 0 world bank reassuring message those fearful b... technology 5 [world, bank, reassuring, message, those, fear...
1 1 ryker price looking like trans... technology 5 [ryker, price, looking, like, transformer, cro...
2 2 when story told historians hard pressed... technology 5 [when, story, told, historians, hard, pressed,...
3 3 this march facebook coming under global scru... technology 5 [this, march, facebook, coming, under, global,...
4 4 facebook accounts were affected data brea... technology 5 [facebook, accounts, were, affected, data, bre...

Split Data into Train and Test

In [12]:
# Split into training (70%) and test (30%) sets. Note that a bare
# np.random.RandomState(42) does not seed train_test_split, so the seed
# is passed directly.
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.3, random_state=42)
In [13]:
train=train.reset_index(drop=True)
test=test.reset_index(drop=True)
In [14]:
train.head()
Out[14]:
Unnamed: 0 Text Label Category words
0 3055 descended house recently throwing family lif... lifeandstyle 3 [descended, house, recently, throwing, family,...
1 1648 free wifi services offered hotels airports ... technology 5 [free, wifi, services, offered, hotels, airpor...
2 11123 pollution inequality between world rich poor... environment 2 [pollution, inequality, between, world, rich, ...
3 1984 british political party will victim hack simi... technology 5 [british, political, party, will, victim, hack...
4 3499 them everywhere kitted form hugging lycra c... lifeandstyle 3 [them, everywhere, kitted, form, hugging, lycr...
In [15]:
test.head()
Out[15]:
Unnamed: 0 Text Label Category words
0 4400 mark carney offer stay sound like desperate a... business 0 [mark, carney, offer, stay, sound, like, despe...
1 10926 romania security forces have mounted series r... environment 2 [romania, security, forces, have, mounted, ser...
2 3377 countryside patrick barkham does ... lifeandstyle 3 [countryside, patrick, barkham, does, spring, ...
3 4803 name blockbuster from beforetimes ... business 0 [name, blockbuster, from, beforetimes, appeara...
4 11609 michael gove become embroiled twitter with ov... environment 2 [michael, gove, become, embroiled, twitter, wi...
In [16]:
test = test.iloc[0:500]  # cap the test set at 500 articles to keep scoring time manageable
In [17]:
print('Number of Articles')
print ("Train_business      :{}".format(len(train[train.Category==0])))
print ("Train_culture       :{}".format(len(train[train.Category==1])))
print ("Train_environment   :{}".format(len(train[train.Category==2])))
print ("Train_lifeandstyle  :{}".format(len(train[train.Category==3])))
print ("Train_sport         :{}".format(len(train[train.Category==4])))
print ("Train_technology    :{}".format(len(train[train.Category==5])))

print ("Test_business       :{}".format(len(test[test.Category==0])))
print ("Test_culture        :{}".format(len(test[test.Category==1])))
print ("Test_environment    :{}".format(len(test[test.Category==2])))
print ("Test_lifeandstyle   :{}".format(len(test[test.Category==3])))
print ("Test_sport          :{}".format(len(test[test.Category==4])))
print ("Test_technology     :{}".format(len(test[test.Category==5])))

print ("Total               :{}".format(len(df)))
Number of Articles
Train_business      :1400
Train_culture       :1377
Train_environment   :1392
Train_lifeandstyle  :1409
Train_sport         :1418
Train_technology    :1404
Test_business       :73
Test_culture        :78
Test_environment    :101
Test_lifeandstyle   :89
Test_sport          :79
Test_technology     :80
Total               :12000
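The split leaves the classes slightly unbalanced (e.g. 1377 culture vs 1418 sport articles in training). If exact balance mattered, one option (an alternative, not what this notebook does) would be a stratified split:

train, test = train_test_split(df, test_size=0.3, random_state=42, stratify=df.Category)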

Training Data

The following tables count how many times each token appears in the training articles of the business, culture, environment, life-and-style, sport, and technology groups. Every occurrence is counted, so frequent tokens accumulate large counts.

In [18]:
# Flatten every training article into one token list, then count occurrences per word.
WORDS = []
for cell in train.words:
    for word in cell:
        WORDS.append(word)
words = pd.DataFrame(WORDS, columns=['word'])
words['count'] = 1
Words = words.groupby('word', as_index=False).sum().iloc[:, 0:2]
#Words=Words[Words['count']>10]
Words.head()
Out[18]:
word count
0 aaaaaaaaaa 1
1 aaaaaaaaaarghhh 1
2 aaaaaaargh 1
3 aaaaaas 1
4 aaaaand 1

Inverse document frequency (IDF)

IDF weighs each keyword by how rare it is across the training corpus: tokens that occur in almost every article get a weight near zero, while rare tokens get large weights.
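Concretely, the next cell computes

$$\mathrm{idf}(w) = \log\frac{N_{\mathrm{train}}}{\mathrm{count}(w)}$$

where $N_{\mathrm{train}} = 8400$ is the number of training articles (hence $\log 8400 \approx 9.036$ for words seen once). Note that $\mathrm{count}(w)$ here is the total number of occurrences of $w$, not the number of documents containing it, so this is a corpus-frequency variant of the textbook IDF.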

In [19]:
Words['idf']=np.log(len(train)/Words['count'])
In [20]:
Words.head()
Out[20]:
word count idf
0 aaaaaaaaaa 1 9.035987
1 aaaaaaaaaarghhh 1 9.035987
2 aaaaaaargh 1 9.035987
3 aaaaaas 1 9.035987
4 aaaaand 1 9.035987
In [21]:
businesswords = []
for cell in train[train.Category==0].words:
    for word in cell:
        businesswords.append(word)
words=pd.DataFrame(businesswords, columns=['word'])
words['Appearances in business']=1
businesswords=words.groupby('word', as_index=False).sum().iloc[:,0:2]
#businesswords=businesswords[businesswords['Appearances in business']>5]
businesswords.head()
Out[21]:
word Appearances in business
0 aabout 1
1 aaron 1
2 aaronovitch 1
3 aaronson 1
4 abandon 28
In [22]:
culturewords = []
for cell in train[train.Category==1].words:
    for word in cell:
        culturewords.append(word)
words=pd.DataFrame(culturewords, columns=['word'])
words['Appearances in culture']=1
culturewords=words.groupby('word', as_index=False).sum().iloc[:,0:2]
#culturewords=culturewords[culturewords['Appearances in culture']>5]
culturewords.head()
Out[22]:
word Appearances in culture
0 aaaaand 1
1 aacta 2
2 aaliyah 1
3 aamer 1
4 aamir 1
In [23]:
environmentwords = []
for cell in train[train.Category==2].words:
    for word in cell:
        environmentwords.append(word)
words=pd.DataFrame(environmentwords, columns=['word'])
words['Appearances in environment']=1
environmentwords=words.groupby('word', as_index=False).sum().iloc[:,0:2]
#environmentwords=environmentwords[environmentwords['Appearances in environment']>5]
environmentwords.head()
Out[23]:
word Appearances in environment
0 aadmi 1
1 aarash 1
2 aarhus 8
3 aaron 5
4 abalone 3
In [24]:
lifeandstylewords = []
for cell in train[train.Category==3].words:
    for word in cell:
        lifeandstylewords.append(word)
words=pd.DataFrame(lifeandstylewords, columns=['word'])
words['Appearances in lifeandstyle']=1
lifeandstylewords=words.groupby('word', as_index=False).sum().iloc[:,0:2]
#lifeandstylewords=lifeandstylewords[lifeandstylewords['Appearances in lifeandstyle']>5]
lifeandstylewords.head()
Out[24]:
word Appearances in lifeandstyle
0 aaaaaaaaaa 1
1 aaaah 1
2 aardvark 2
3 aaron 14
4 aarti 1
In [25]:
sportwords = []
for cell in train[train.Category==4].words:
    for word in cell:
        sportwords.append(word)
words=pd.DataFrame(sportwords, columns=['word'])
words['Appearances in sport']=1
sportwords=words.groupby('word', as_index=False).sum().iloc[:,0:2]
#sportwords=sportwords[sportwords['Appearances in sport']>5]
sportwords.head()
Out[25]:
word Appearances in sport
0 aaaaaaaaaarghhh 1
1 aaaaaas 1
2 aagh 1
3 aahed 1
4 aahs 2
In [26]:
technologywords = []
for cell in train[train.Category==5].words:
    for word in cell:
        technologywords.append(word)
words=pd.DataFrame(technologywords, columns=['word'])
words['Appearances in technology']=1
technologywords=words.groupby('word', as_index=False).sum().iloc[:,0:2]
#technologywords=technologywords[technologywords['Appearances in technology']>5]
technologywords.head()
Out[26]:
word Appearances in technology
0 aaaaaaargh 1
1 aaaagh 1
2 aaah 1
3 aaargh 2
4 aabel 1
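The six cells above repeat the same counting logic. A small helper (a sketch with a hypothetical name, not part of the original notebook) could replace them:

def category_word_counts(train_df, category_code, column_name):
    # Flatten the token lists of one category and count occurrences per word.
    tokens = [w for cell in train_df[train_df.Category == category_code].words for w in cell]
    counts = pd.DataFrame(tokens, columns=['word'])
    counts[column_name] = 1
    return counts.groupby('word', as_index=False).sum().iloc[:, 0:2]

# e.g. businesswords = category_word_counts(train, 0, 'Appearances in business')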
In [27]:
# Outer-join the per-category counts onto the global word table;
# words missing from a category get NaN, replaced by 0 below.
merged1 = pd.merge(Words, businesswords, on='word', how='outer')
merged2 = pd.merge(merged1, culturewords, on='word', how='outer')
merged3 = pd.merge(merged2, environmentwords, on='word', how='outer')
merged4 = pd.merge(merged3, lifeandstylewords, on='word', how='outer')
merged5 = pd.merge(merged4, sportwords, on='word', how='outer')
merged6 = pd.merge(merged5, technologywords, on='word', how='outer')
Words = merged6.fillna(0)
Words.head()
Out[27]:
word count idf Appearances in business Appearances in culture Appearances in environment Appearances in lifeandstyle Appearances in sport Appearances in technology
0 aaaaaaaaaa 1 9.035987 0.0 0.0 0.0 1.0 0.0 0.0
1 aaaaaaaaaarghhh 1 9.035987 0.0 0.0 0.0 0.0 1.0 0.0
2 aaaaaaargh 1 9.035987 0.0 0.0 0.0 0.0 0.0 1.0
3 aaaaaas 1 9.035987 0.0 0.0 0.0 0.0 1.0 0.0
4 aaaaand 1 9.035987 0.0 1.0 0.0 0.0 0.0 0.0
In [28]:
Words = Words[Words.idf > 0]  # drop ubiquitous words (total count >= number of training articles)
In [29]:
Words = Words[Words['count'] > 10]  # drop rare words

Classification Process

First, we break the article we want to classify into its individual words $w_1, \ldots, w_n$. Naive Bayes assumes the words are conditionally independent given the category, so the probability of observing the article factorises into a product over its words:

$$P(\text{Article}) = P(w_1, \ldots, w_n) = \prod_{i=1}^{n} P(w_i)$$
$$P(\text{Article} \mid \text{Buss}) = P(w_1, \ldots, w_n \mid \text{Buss}) = \prod_{i=1}^{n} P(w_i \mid \text{Buss})$$

The same decomposition applies to every article and to each of the six categories.

Using the training set, we estimate the probability of each word given each category: business, culture, environment, life-and-style, sport, and technology.
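Since the six classes are (approximately) equally represented, the class prior is close to uniform and the notebook drops it; the predicted label is simply the category with the largest sum of log word probabilities:

$$\hat{c} = \arg\max_{c} \sum_{i=1}^{n} \log P(w_i \mid c)$$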

Estimating these probabilities directly from counts is problematic: a word that never appears in a category's training articles gets probability 0 and zeroes out the entire product. To avoid this we smooth the estimates, adding a small constant offset to every probability in the spirit of Laplace smoothing.
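Concretely, the next cell computes, for each category $c$,

$$P(w \mid c) = \frac{\mathrm{count}(w, c)}{\sum_{w'} \mathrm{count}(w', c)} + \frac{0.1}{N}$$

where $N$ is the total token count of the pruned vocabulary. Classic add-one Laplace smoothing would instead use $P(w \mid c) = \frac{\mathrm{count}(w, c) + 1}{\sum_{w'} \mathrm{count}(w', c) + |V|}$ with $|V|$ the vocabulary size; the constant-offset version used here serves the same purpose of keeping unseen words from zeroing out the product.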

In [33]:
smoothing = 0.1 / Words['count'].sum()
per_category = [
    ('P(W|Buss)',   'Appearances in business',     businesswords),
    ('P(W|Cult)',   'Appearances in culture',      culturewords),
    ('P(W|Env)',    'Appearances in environment',  environmentwords),
    ('P(W|LifStd)', 'Appearances in lifeandstyle', lifeandstylewords),
    ('P(W|Sport)',  'Appearances in sport',        sportwords),
    ('P(W|Tech)',   'Appearances in technology',   technologywords),
]
for prob_col, count_col, cat_counts in per_category:
    Words[prob_col] = Words[count_col] / cat_counts[count_col].sum() + smoothing
Words.head()
Out[33]:
word count idf Appearances in business Appearances in culture Appearances in environment Appearances in lifeandstyle Appearances in sport Appearances in technology P(W|Buss) P(W|Cult) P(W|Env) P(W|LifStd) P(W|Sport) P(W|Tech)
22 aami 11 6.638092 0.0 0.0 0.0 0.0 11.0 0.0 3.061291e-08 3.061291e-08 3.061291e-08 3.061291e-08 0.000011 3.061291e-08
31 aaron 109 4.344639 1.0 12.0 5.0 14.0 72.0 5.0 1.349427e-06 1.856893e-05 8.760791e-06 2.757904e-05 0.000073 7.583757e-06
42 aback 14 6.396930 0.0 2.0 0.0 7.0 2.0 3.0 3.061291e-08 3.120332e-06 3.061291e-08 1.380483e-05 0.000002 4.562499e-06
48 abandon 92 4.514198 28.0 11.0 13.0 11.0 20.0 9.0 3.695741e-05 1.702407e-05 2.272908e-05 2.167580e-05 0.000020 1.362627e-05
49 abandoned 194 3.768129 33.0 33.0 46.0 22.0 45.0 15.0 4.355148e-05 5.101098e-05 8.034825e-05 4.332100e-05 0.000045 2.269004e-05
In [34]:
start2 = time.time()
arr = ['P(W|Buss)', 'P(W|Cult)', 'P(W|Env)', 'P(W|LifStd)', 'P(W|Sport)', 'P(W|Tech)']
arr2 = ['log(P(wi|Buss))', 'log(P(wi|Cult))', 'log(P(wi|Env))', 'log(P(wi|LifStd))', 'log(P(wi|Sport))', 'log(P(wi|Tech))']
for p, p2 in zip(arr, arr2):
    # One single-column frame of tokens per test article.
    Y = []
    for i in range(0, len(test)):
        Y.append(pd.DataFrame(test.words[i], columns=['word']))
    # Look up each token's smoothed probability (empty for out-of-vocabulary words).
    A = []
    for i in range(len(test)):
        A.append([np.array(Words[Words.word == wrds][p]) for wrds in Y[i].word])
    # Sum the log-probabilities per article; iloc[1:] skips the first token
    # and dropna() discards unknown words.
    ZZ = []
    for i in range(0, len(test)):
        ZZ.append(np.log(pd.DataFrame(A[i]).iloc[1:, ]).dropna().sum())
    test[p2] = pd.DataFrame(ZZ)
end2 = time.time()
print("time (s) : {:.4f}".format(end2 - start2))
time (s) : 6528.4144
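Each token lookup above scans the whole Words table, which is what drives the roughly 109-minute runtime. A dictionary-based rescoring along these lines (a sketch, not the notebook's original code; it also scores the first token of each article, which the loop above skips, so scores may differ slightly) should run in seconds:

# Build one word -> log-probability dict per category, then sum per article.
for p, p2 in zip(arr, arr2):
    table = np.log(Words.set_index('word')[p]).to_dict()
    test[p2] = test['words'].apply(
        lambda ws: sum(table[w] for w in ws if w in table)  # skip out-of-vocabulary words
    )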
In [35]:
test.head()
Out[35]:
Unnamed: 0 Text Label Category words log(P(wi|Buss)) log(P(wi|Cult)) log(P(wi|Env)) log(P(wi|LifStd)) log(P(wi|Sport)) log(P(wi|Tech))
0 4400 mark carney offer stay sound like desperate a... business 0 [mark, carney, offer, stay, sound, like, despe... -1291.619552 -1372.255165 -1389.793101 -1397.703253 -1409.098479 -1360.221880
1 10926 romania security forces have mounted series r... environment 2 [romania, security, forces, have, mounted, ser... -2417.418293 -2504.248538 -2307.967083 -2564.397852 -2721.002640 -2447.485691
2 3377 countryside patrick barkham does ... lifeandstyle 3 [countryside, patrick, barkham, does, spring, ... -7949.011449 -7220.498377 -7583.662117 -7177.570643 -7707.589284 -7857.034063
3 4803 name blockbuster from beforetimes ... business 0 [name, blockbuster, from, beforetimes, appeara... -1807.287642 -1693.970944 -1853.226109 -1715.501557 -1813.334429 -1711.724248
4 11609 michael gove become embroiled twitter with ov... environment 2 [michael, gove, become, embroiled, twitter, wi... -1860.439818 -2015.195762 -1722.669945 -1925.593743 -2056.441882 -1978.825582
In [36]:
# Highest per-article log-probability across the six category columns.
test['max'] = test.iloc[:, 5:11].max(axis=1)
test = test.dropna()
In [37]:
test = test[test['max'] < 0]  # drop articles whose tokens were all out of vocabulary (sum == 0)
In [38]:
test=test.reset_index(drop=True)
In [39]:
# Assign the category whose log-probability equals the row maximum.
label = []
for i in range(0, len(test)):
    if test['log(P(wi|Buss))'][i] == test['max'][i]:
        label.append(0)
    if test['log(P(wi|Cult))'][i] == test['max'][i]:
        label.append(1)
    if test['log(P(wi|Env))'][i] == test['max'][i]:
        label.append(2)
    if test['log(P(wi|LifStd))'][i] == test['max'][i]:
        label.append(3)
    if test['log(P(wi|Sport))'][i] == test['max'][i]:
        label.append(4)
    if test['log(P(wi|Tech))'][i] == test['max'][i]:
        label.append(5)
test['predict_label'] = label
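The same labelling can be written in one idiomatic line with idxmax (a sketch; note that, unlike the loop above, it resolves ties by taking the first matching column instead of appending two labels):

# Map each log-probability column to its category code, then take the row-wise argmax.
col_to_code = {p2: code for code, p2 in enumerate(arr2)}
test['predict_label'] = test[arr2].idxmax(axis=1).map(col_to_code)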
In [43]:
array = confusion_matrix(test['Category'], test['predict_label'])
df_cm = pd.DataFrame(array, range(6), range(6))
plt.figure(figsize=(10, 7))
sn.set(font_scale=1.4)  # label size
sn.heatmap(df_cm, annot=True, annot_kws={"size": 25}, fmt="d")  # annotation font size
# Accuracy = diagonal (correct predictions) over all scored test articles.
print("Accuracy : %{:.4f}".format((np.trace(array) / len(test)) * 100))
Accuracy : %89.2929
In [44]:
print(classification_report(test['Category'], test['predict_label']))
             precision    recall  f1-score   support

          0       0.91      0.84      0.87        73
          1       0.79      0.92      0.85        75
          2       0.91      0.87      0.89       100
          3       0.81      0.83      0.82        88
          4       1.00      0.96      0.98        79
          5       0.96      0.95      0.96        80

avg / total       0.90      0.89      0.89       495


Published: November 20, 2018