
Sentiment Search Engine
Sentiment Search Engine is a web-based app that, for any given query, fetches tweets from Twitter, performs sentiment analysis on them, and classifies each tweet as positive or negative. The script below trains the underlying classifier: it builds a bag-of-words model over the Amazon Cell Phones and Accessories review dataset and fits a linear SVM, pickling the preprocessing helpers and the trained model so the web app can reuse them (a sketch of the query-time flow follows the script).
import json
import nltk  # nltk must be installed so the pickled NLTK helpers below can unpickle
import random
import re
import collections
import timeit
import pickle
from sklearn import svm
from sklearn.metrics import accuracy_score
# Only needed when regenerating the pickled helpers (see commented blocks below):
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer, WordNetLemmatizer
# from nltk.tokenize import RegexpTokenizer
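# Polarity shifters: negation and contrast words that are kept even though they
# are stopwords, since they flip or soften sentiment ('onli' is the stem of 'only').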
p_shifters = ['no' , 'not' , 'nor' , 'neither' , 'never' , 't' , 'nt' ,'aint' , 'but' , 'only', 'onli']
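# The NLTK helpers were built once (the commented blocks) and pickled; later runs
# simply reload them from disk.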
# s_list = set(stopwords.words('english'))
# with open('pickleStopwords.txt' , 'wb') as fl:
#     pickle.dump(s_list , fl)
with open('pickleStopwords.txt' , 'rb') as fl:
    s_list = pickle.load(fl)
# stemmer = PorterStemmer()
# with open('pickleStemmer.txt' , 'wb') as fl:
#     pickle.dump(stemmer , fl)
with open('pickleStemmer.txt' , 'rb') as fl:
    stemmer = pickle.load(fl)
# lemmatizer = WordNetLemmatizer()
# with open('pickleLemmatizer.txt' , 'wb') as fl:
#     pickle.dump(lemmatizer , fl)
with open('pickleLemmatizer.txt' , 'rb') as fl:
    lemmatizer = pickle.load(fl)
# tokenizer = RegexpTokenizer(r'\w+')
# with open('pickleTokenizer.txt' , 'wb') as fl:
#     pickle.dump(tokenizer , fl)
with open('pickleTokenizer.txt' , 'rb') as fl:
    tokenizer = pickle.load(fl)
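# Corpus-level containers: tokenized documents with labels, raw token counts,
# and the numeric bag-of-words matrix.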
document = []
features = []
feature_set = []
n_matrix = []
labels = []
print("Reading Data (This will take time) - - - ")
file = open('Cell_Phones_and_Accessories_5.json','r')
# file = open('dummyJson.json','r')
startT = timeit.default_timer()
for l in file.readlines():
    filtered = []
    raw = json.loads(l)
    txt = raw['reviewText']
    txt = re.sub(r'\d+' , '' , txt)
    txt = txt.lower()
    txt = tokenizer.tokenize(txt)
    for w in txt:
        # Keep polarity shifters and all non-stopwords; drop other stopwords
        if w in p_shifters or w not in s_list:
            token = lemmatizer.lemmatize(stemmer.stem(w))
            filtered.append(token)
            features.append(token)
    # Star rating -> label: >3 pos, <3 neg, ==3 neu
    if raw['overall'] > 3:
        document.append([filtered , 'pos'])
    elif raw['overall'] < 3:
        document.append([filtered , 'neg'])
    else:
        document.append([filtered , 'neu'])
file.close()
endT = timeit.default_timer()
with open('details.txt','a') as fl:
    details = "Data Read Time is : " + str((endT - startT)/60) + ' minutes\n'
    fl.write(details)
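# Shuffle so the sequential train/test split further down is effectively random.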
random.shuffle(document)
features = collections.Counter(features)
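# The 1000 most frequent corpus tokens become the bag-of-words vocabulary.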
for w , _ in features.most_common(1000):
    feature_set.append(w)
with open('pickleFeatures.txt' , 'wb') as fl:
    pickle.dump(feature_set , fl)
print("Making Numeric Matrix (This will take time) - - - ")
startT = timeit.default_timer()
for l in document:
    count = collections.Counter(l[0])
    data = []
    for w in feature_set:
        data.append(count[w])
    n_matrix.append(data)
    labels.append(l[1])
endT = timeit.default_timer()
with open('details.txt','a') as fl:
    details = "Making Numeric Matrix Time : " + str((endT - startT)/60) + ' minutes\n'
    fl.write(details)
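# Hard-coded split: the first 48000 shuffled reviews train the model, the remainder tests it.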
training_set = n_matrix[:48000]
training_labels = labels[:48000]
testing_set = n_matrix[48000:]
testing_labels = labels[48000:]
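# Linear-kernel SVM trained on the three labels (pos/neg/neu).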
classifier = svm.SVC(kernel='linear', decision_function_shape='ovr')
# with open('pickleClassifier.txt' , 'rb') as fl:
#     classifier = pickle.load(fl)
print('Training SVM (This will take time) - - -')
startT = timeit.default_timer()
classifier.fit(training_set , training_labels)
endT = timeit.default_timer()
with open('details.txt','a') as fl:
    details = "Training Time (1000 features, 48000 instances) : " + str((endT - startT)/60) + ' minutes\n'
    fl.write(details)
with open('pickleClassifier.txt', 'wb') as fl:
    pickle.dump(classifier , fl)
print('Testing SVM - - -')
startT = timeit.default_timer()
predicted = classifier.predict(testing_set)
endT = timeit.default_timer()
with open('details.txt','a') as fl:
    details = "Testing Time (" + str(len(testing_set)) + " samples) : " + str((endT - startT)/60) + ' minutes\n'
    fl.write(details)
print("SVM Accuracy is : " , accuracy_score(testing_labels , predicted) * 100 , '\n\n')
with open('details.txt','a') as fl:
details = "SVM Accuracy is : " + str(accuracy_score(testing_labels , predicted) * 100) + '\n'
fl.write(details)
fl.write('=================================================\n')
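
At query time, the web app reloads the artifacts pickled above and classifies fresh text with them. Below is a minimal sketch of that flow; fetch_tweets is a hypothetical placeholder for the actual Twitter API call, which is not part of this script.

import collections
import pickle
import re

# Same polarity shifters used during training.
p_shifters = ['no', 'not', 'nor', 'neither', 'never', 't', 'nt', 'aint', 'but', 'only', 'onli']

# Reload the artifacts pickled by the training script.
with open('pickleStopwords.txt', 'rb') as fl:
    s_list = pickle.load(fl)
with open('pickleStemmer.txt', 'rb') as fl:
    stemmer = pickle.load(fl)
with open('pickleLemmatizer.txt', 'rb') as fl:
    lemmatizer = pickle.load(fl)
with open('pickleTokenizer.txt', 'rb') as fl:
    tokenizer = pickle.load(fl)
with open('pickleFeatures.txt', 'rb') as fl:
    feature_set = pickle.load(fl)
with open('pickleClassifier.txt', 'rb') as fl:
    classifier = pickle.load(fl)

def vectorize(text):
    # Mirror the training preprocessing: strip digits, lowercase, tokenize,
    # drop stopwords (except polarity shifters), stem + lemmatize, then count
    # occurrences of each of the 1000 vocabulary tokens.
    text = re.sub(r'\d+', '', text.lower())
    tokens = [lemmatizer.lemmatize(stemmer.stem(w))
              for w in tokenizer.tokenize(text)
              if w in p_shifters or w not in s_list]
    count = collections.Counter(tokens)
    return [count[w] for w in feature_set]

def classify_query(query):
    # fetch_tweets(query) is a hypothetical helper wrapping the Twitter API;
    # it is assumed to return a list of tweet strings for the query.
    tweets = fetch_tweets(query)
    return [(t, classifier.predict([vectorize(t)])[0]) for t in tweets]

classify_query returns (tweet, label) pairs with labels in {'pos', 'neg', 'neu'}, which the front end can tally per query.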