Click here to Skip to main content
15,891,409 members
Please Sign up or sign in to vote.
0.00/5 (No votes)
Python
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from KaggleWord2VecUtility import KaggleWord2VecUtility
import pandas as pd
import numpy as np
import logging

from nltk.corpus import stopwords # imports various modules for string cleaning
import re # remove punctuations and numbers
import nltk.data

from bs4 import BeautifulSoup # removes stopwords

from gensim.models import Word2Vec

from gensim.models.wrappers.fasttext import FastText
from gensim.models.keyedvectors import KeyedVectors

import codecs

# nltk.download()

# Read data from files
train = pd.read_csv( "labeledTrainData.tsv", header=0,
                    delimiter="\t", quoting=3 )
test = pd.read_csv( "testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv", header=0,
                              delimiter="\t", quoting=3 )

print("Read %d labeled train reviews, %d labeled test reviews, " \
"and %d unlabled reviews \n" %
(train["review"].size,
 test["review"].size,
 unlabeled_train["review"].size))

def review_to_words(raw_review):
    # remove HTML
    review_text = BeautifulSoup(raw_review).get_text()
    # remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # convert to lower case and to individual words in a list
    words = letters_only.lower().split()
    # convert words to a set
    stops = set(stopwords.words("english"))
    # remove the stop words
    meaningful_words = [w for w in words if not w in stops]
    # join the words back itno one string separated by space and return the result
    return(" ".join(meaningful_words))

clean_review = review_to_words(train["review"][0])
print(clean_review)

num_reviews = train["review"].size
clean_train_reviews = []


for i in range(0, num_reviews):
    # if the index is evenly divisble by 1000, print a message
    if((i + 1) % 1000 == 0):
        print("Review %d of %d\n" % (i + 1, num_reviews))
        clean_review = review_to_words(train["review"][i])
        clean_train_reviews.append(clean_review)



vectorizer = CountVectorizer(analyzer = "word", \
                             tokenizer = None, \
                             preprocessor = None, \
                             stop_words = None, \
                             max_features = 5000)


# fit_transform: fits the model and learns the vocab; then transforms our training data
# into feature vectors. The input to fit_transform should be a list of strings
train_data_features = vectorizer.fit_transform(clean_train_reviews)
np.asarray(train_data_features)
print(train_data_features.shape)

print("Training the random forest...")
forest = RandomForestClassifier(n_estimators = 500)
forest = forest.fit(train_data_features, train["sentiment"])

# Testing 
clean_test_reviews = []

print("Cleaning and parsing test set movie reviews...\n")
for i in xrange(0, len(test["review"])):
    clean_test_reviews.append(" ".join(KaggleWord2VecUtility.review_to_words(test["review"][i], True)))

test_data_features = vectorizer.transform(clean_test_reviews)
np.asarray(test_data_features)

result = forest.predict(test_data_features) 




# copy the results to a pandas dataframe with 
# an "id" column 
# sentiment column 
output = pd.DataFrame(data = {"id":test["id"], "sentiment":result})
output.to_csv("model.csv", index = False, quoting = 3)

# Function to convert a document into a sequence of words
# Optionally removing stop words
# Returns a list of words
def review_to_wordlist(review, remove_stopwords=False ):
    # Removes HTML
    review_text = BeautifulSoup(review).get_text()
    # Removes non-letters
    review_text = re.sub("[^a-zA-Z]","", review_text)
    # Converts words to lower case and splits them by whitecase to an array
    words = review_text.lower().split()
    # Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    # Returns the list of words
    return(words)

# Loads the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle');

# Defines a function to split a review into parsed sentences
# splits a reiew into parsed sentences. Returns a list of sentences, where each
# sentence is a list of words
def review_to_sentences(review, tokenizer, remove_stopwords=False ):
    raw_sentences = tokenizer.tokenize(review.strip()) # remove .decode() 
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, \
             remove_stopwords ))
    return sentences

sentences = [] # initialize an empty list of sentences for the review_to_sentences()

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print(len(sentences))
print(sentences[0])

# Training Model
# Some C and Java word2vec tools are known to truncate the strings at byte boundaries, 
# which can result in cutting a multi-byte utf8 character in half, making it non-valid utf8
# leading to this error -> Unicode codec can't decode bytes in position (x)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s' , \
	level = logging.INFO);

num_features = 360 # word vector dimesionality
min_word_count = 40 # minimum word count
num_workers = 4 # number of threads to run in parallel
context = 10 # context window size
downsampling = 1e-3 # downsample setting for frequent words

print("Training model...")

""" model = Word2Vec(sentences, workers = num_workers, \
	size = num_features, min_count = min_word_count, \
	window = context, sample = downsampling, seed = 1)  """

embedding_dict = KeyedVectors.load_word2vec_format(sentences, binary = False, encoding='utf-8') #, unicode_errors='ignore')
embedding_dict = save_word2vec_format(sentences+".bin", binary = True)
embedding_dict = KeyedVectors.load_word2vec_format(sentences+".bin", binary = True)  

print("workkkk")
print("test 123" + '\n')
print(embedding_dict.most_similar_to_given('snake', ['pie', 'animal', 'vase', 'pizza']))

""" embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName, binary=False) 
embedding_dict.save_word2vec_format(dictFileName+".bin", binary=True) 
embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName+".bin", binary=True) """


What I have tried:

Compiling the code using Python's Tensorflow with various helper vectorization libraries yields this error:
ValueError: Number of labels=25000 does not match number of samples=25


I'm not sure how exactly to change the x or y fitting values?

Thanks in advance for the assistance.
Posted
Updated 10-Jan-19 1:14am
v3
Comments
Richard MacCutchan 10-Jan-19 4:54am    
Please edit your question, add proper <pre> tags around your code, and indicate exactly where the error occurs.

1 solution

I suggest comment out as much as possible and compile to make sure a codeless app copiles correctly, and then start adding code back a little at a time, and compile after each part is added back. Do this until the copile fails, and you've at least narrowed down your problem and can concentrate on the code that caused thefailure.
 
Share this answer
 

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)



CodeProject, 20 Bay Street, 11th Floor Toronto, Ontario, Canada M5J 2N8 +1 (416) 849-8900