import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from KaggleWord2VecUtility import KaggleWord2VecUtility
import pandas as pd
import numpy as np
import logging
from nltk.corpus import stopwords
import re
import nltk.data
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from gensim.models.wrappers.fasttext import FastText
from gensim.models.keyedvectors import KeyedVectors
import codecs
# Load the three IMDB review sets. quoting=3 (csv.QUOTE_NONE) prevents
# pandas from treating the double quotes embedded in reviews as field
# delimiters; the files are tab-separated with a header row.
train = pd.read_csv("labeledTrainData.tsv", header=0,
                    delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0,
                              delimiter="\t", quoting=3)
# Fixed typo in the status message: "unlabled" -> "unlabeled".
print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews \n" %
      (train["review"].size,
       test["review"].size,
       unlabeled_train["review"].size))
def review_to_words(raw_review):
    """Convert one raw HTML review to a cleaned, space-joined word string.

    Steps: strip markup, drop every non-letter character, lowercase,
    remove English stopwords.

    Args:
        raw_review: raw review text, possibly containing HTML markup.

    Returns:
        A single string of space-separated lowercase words.
    """
    # Specify the parser explicitly; bare BeautifulSoup(raw_review) picks
    # whichever parser is installed, which is nondeterministic across
    # environments and emits a warning.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Replace non-letters with a space so adjacent words stay separated.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    # NOTE: the stopword set is rebuilt on every call; acceptable here,
    # but hoisting it would speed up bulk cleaning.
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)
# Sanity-check the cleaner on the first training review.
clean_review = review_to_words(train["review"][0])
print(clean_review)

# Clean every training review, reporting progress every 1000 items.
num_reviews = train["review"].size
clean_train_reviews = []
for idx in range(num_reviews):
    if (idx + 1) % 1000 == 0:
        print("Review %d of %d\n" % (idx + 1, num_reviews))
    clean_train_reviews.append(review_to_words(train["review"][idx]))
# Bag-of-words features: raw term counts restricted to the 5000 most
# frequent words in the training corpus.
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)
train_data_features = vectorizer.fit_transform(clean_train_reviews)
# Removed a discarded `np.asarray(train_data_features)` call: its result
# was never assigned, so it was a no-op (the sparse matrix is passed to
# the forest unchanged either way).
print(train_data_features.shape)

print("Training the random forest...")
forest = RandomForestClassifier(n_estimators=500)
forest = forest.fit(train_data_features, train["sentiment"])
# Clean the test reviews with the shared utility, vectorize them with the
# fitted vocabulary, predict sentiment, and write a Kaggle submission file.
print("Cleaning and parsing test set movie reviews...\n")
clean_test_reviews = []
# BUG FIX: `xrange` is Python 2 only and raises NameError here — this file
# uses Python 3 print() throughout. Use range().
for i in range(len(test["review"])):
    clean_test_reviews.append(
        " ".join(KaggleWord2VecUtility.review_to_words(test["review"][i], True)))

# transform (not fit_transform): reuse the training vocabulary.
test_data_features = vectorizer.transform(clean_test_reviews)
result = forest.predict(test_data_features)
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("model.csv", index=False, quoting=3)
def review_to_wordlist(review, remove_stopwords=False):
    """Convert one raw HTML review to a list of lowercase words.

    Args:
        review: raw review text, possibly containing HTML markup.
        remove_stopwords: if True, drop English stopwords.

    Returns:
        A list of words.
    """
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # BUG FIX: the replacement was "" (empty string), which glued adjacent
    # words together whenever punctuation separated them (e.g. "good,bad"
    # became "goodbad"). Use a space, consistent with review_to_words.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
# Pre-trained NLTK Punkt model for splitting reviews into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each tokenized to a word list.

    Args:
        review: raw review text.
        tokenizer: an NLTK sentence tokenizer (e.g. the Punkt model).
        remove_stopwords: forwarded to review_to_wordlist.

    Returns:
        A list of word lists; empty sentences are skipped.
    """
    raw_sentences = tokenizer.tokenize(review.strip())
    return [review_to_wordlist(raw, remove_stopwords)
            for raw in raw_sentences
            if len(raw) > 0]
# Build the word2vec training corpus: one tokenized word list per sentence,
# drawn from both the labeled and the unlabeled training reviews.
sentences = []
print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)
# Fixed typo in the status message: "unlabled" -> "unlabeled".
print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)
print(len(sentences))
print(sentences[0])
# Verbose gensim logging so training progress is visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Word2Vec hyperparameters.
num_features = 360     # dimensionality of the word vectors
min_word_count = 40    # ignore words with fewer total occurrences
num_workers = 4        # parallel training threads
context = 10           # context window size
downsampling = 1e-3    # downsample threshold for very frequent words

print("Training model...")
# BUG FIX: `sentences` is an in-memory list of token lists, not a file
# path. The original code passed it to KeyedVectors.load_word2vec_format
# (which expects a filename) and then evaluated `sentences + ".bin"`
# (list + str -> TypeError); save_word2vec_format was also called as a
# free function. Correct sequence: train a Word2Vec model on the corpus,
# persist its vectors in binary word2vec format, then reload them.
model = Word2Vec(sentences, workers=num_workers,
                 size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling, seed=1)
model_name = "imdb_word2vec"
model.wv.save_word2vec_format(model_name + ".bin", binary=True)
embedding_dict = KeyedVectors.load_word2vec_format(model_name + ".bin",
                                                   binary=True)
print("workkkk")
print("test 123" + '\n')
# Pick the candidate word whose vector is closest to 'snake'.
print(embedding_dict.most_similar_to_given('snake',
                                           ['pie', 'animal', 'vase', 'pizza']))
What I have tried:
Running the code with TensorFlow and various helper vectorization libraries yields this error:
ValueError: Number of labels=25000 does not match number of samples=25
I'm not sure exactly how I should change the x or y values passed to fit.
Thanks in advance for the assistance.