import pandas as pd
import numpy as np
# Load the BBC News training CSV; the 'Text' and 'Category' columns are
# read from this frame further down the script.
data_df = pd.read_csv("/content/BBC News Train.csv")
data_df.head()
# dropna() returns a new frame instead of modifying in place, so the result
# has to be rebound; otherwise rows with missing values stay in data_df.
data_df = data_df.dropna()
# Per-column count of remaining nulls (notebook-style sanity check).
data_df.isnull().sum()
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing. The target list is parenthesized so
# the six-way unpacking can span several lines; without the parentheses the
# first line is a standalone tuple expression and only the last name is
# assigned.
# NOTE: both the *_label_nums and *_label_names arrays are built from the
# same 'Category' column, so they carry identical values.
(train_corpus, test_corpus,
 train_label_nums, test_label_nums,
 train_label_names, test_label_names) = train_test_split(
    np.array(data_df['Text']),
    np.array(data_df['Category']),
    np.array(data_df['Category']),
    test_size=0.33,
)
train_corpus.shape, test_corpus.shape
# `nltk.download` needs the top-level package in scope; importing only
# `word_tokenize` from nltk.tokenize does not bind the name `nltk`.
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' resource before tokenizing.
nltk.download('punkt')

# Split every document of each partition into a list of word tokens.
tokenized_train = [word_tokenize(text) for text in train_corpus]
tokenized_test = [word_tokenize(text) for text in test_corpus]
import gensim
# Dimensionality of the word embeddings (and of the averaged document vectors).
w2v_num_features = 1000
# vector_size must be passed explicitly: otherwise the model is trained with
# the library's default dimensionality, and document_vectorizer, which only
# accumulates word vectors whose length equals num_features, discards every
# vector and returns all-zero document features.
# sg=1 selects the skip-gram architecture; words seen fewer than
# min_count=2 times are dropped from the vocabulary.
w2v_model = gensim.models.Word2Vec(
    tokenized_train,
    vector_size=w2v_num_features,
    window=100,
    min_count=2,
    sample=1e-3,
    sg=1,
    workers=10,
)
def document_vectorizer(corpus, model, num_features):
    """Turn tokenized documents into averaged word-vector features.

    Args:
        corpus: Iterable of documents, each an iterable of word tokens.
        model: Object exposing ``model.wv.index_to_key`` (the vocabulary)
            and ``model.wv[word]`` (the vector for a vocabulary word).
        num_features: Expected length of every word vector; also the length
            of each returned document vector.

    Returns:
        ``np.ndarray`` with one float64 row per document. A row is the mean
        of the vectors of the document's in-vocabulary words, or all zeros
        when no word contributed.

    Warns:
        RuntimeWarning: If any word vector was skipped because its length
            differs from ``num_features`` (previously this was silent and
            could yield all-zero features).
    """
    import warnings

    vocabulary = set(model.wv.index_to_key)
    skipped_vectors = 0

    def average_word_vectors(words, model, vocabulary, num_features):
        """Return the mean vector of the usable words in one document."""
        nonlocal skipped_vectors
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0
        for word in words:
            if word not in vocabulary:
                continue
            word_vector = model.wv[word]
            if len(word_vector) != num_features:
                # Cannot be added to the accumulator; record it so the
                # caller is told instead of silently diluting the mean.
                skipped_vectors += 1
                continue
            feature_vector += word_vector
            # Count only words that actually contributed to the sum.
            nwords += 1
        if nwords:
            feature_vector /= nwords
        return feature_vector

    features = [
        average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
        for tokenized_sentence in corpus
    ]
    if skipped_vectors:
        warnings.warn(
            f"{skipped_vectors} word vector(s) were ignored because their "
            f"length differs from num_features={num_features}",
            RuntimeWarning,
            stacklevel=2,
        )
    return np.array(features)
# Embed both partitions with the same trained model: train first, then test.
avg_wv_train_features, avg_wv_test_features = (
    document_vectorizer(corpus=docs, model=w2v_model, num_features=w2v_num_features)
    for docs in (tokenized_train, tokenized_test)
)
# Report the resulting feature-matrix shapes.
print(
    'Word2Vec model:> Train features shape:',
    avg_wv_train_features.shape,
    ' Test features shape:',
    avg_wv_test_features.shape,
)
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
# Linear classifier trained with SGD on hinge loss and an L2 penalty.
svm = SGDClassifier(
    loss='hinge',
    penalty='l2',
    random_state=42,
    max_iter=500,
)
# Fit on the averaged word-vector features of the training partition.
svm.fit(avg_wv_train_features, train_label_names)
# Cross-validated scores on the training partition, then their mean.
svm_w2v_cv_scores = cross_val_score(svm, avg_wv_train_features, train_label_names)
svm_w2v_cv_mean_score = np.mean(svm_w2v_cv_scores)
# Score of the fitted classifier on the held-out partition.
svm_w2v_test_score = svm.score(avg_wv_test_features, test_label_names)
