Value error for randomforestclassifer desperate help

Question

0.00/5 (No votes)

See more:

Python

import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from KaggleWord2VecUtility import KaggleWord2VecUtility
import pandas as pd
import numpy as np
import logging

from nltk.corpus import stopwords # imports various modules for string cleaning
import re # remove punctuations and numbers
import nltk.data

from bs4 import BeautifulSoup # removes stopwords

from gensim.models import Word2Vec

from gensim.models.wrappers.fasttext import FastText
from gensim.models.keyedvectors import KeyedVectors

import codecs

# nltk.download()

# Read data from files
train = pd.read_csv( "labeledTrainData.tsv", header=0,
                    delimiter="\t", quoting=3 )
test = pd.read_csv( "testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv", header=0,
                              delimiter="\t", quoting=3 )

print("Read %d labeled train reviews, %d labeled test reviews, " \
"and %d unlabled reviews \n" %
(train["review"].size,
 test["review"].size,
 unlabeled_train["review"].size))

def review_to_words(raw_review):
    # remove HTML
    review_text = BeautifulSoup(raw_review).get_text()
    # remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # convert to lower case and to individual words in a list
    words = letters_only.lower().split()
    # convert words to a set
    stops = set(stopwords.words("english"))
    # remove the stop words
    meaningful_words = [w for w in words if not w in stops]
    # join the words back itno one string separated by space and return the result
    return(" ".join(meaningful_words))

clean_review = review_to_words(train["review"][0])
print(clean_review)

num_reviews = train["review"].size
clean_train_reviews = []


for i in range(0, num_reviews):
    # if the index is evenly divisble by 1000, print a message
    if((i + 1) % 1000 == 0):
        print("Review %d of %d\n" % (i + 1, num_reviews))
        clean_review = review_to_words(train["review"][i])
        clean_train_reviews.append(clean_review)



vectorizer = CountVectorizer(analyzer = "word", \
                             tokenizer = None, \
                             preprocessor = None, \
                             stop_words = None, \
                             max_features = 5000)


# fit_transform: fits the model and learns the vocab; then transforms our training data
# into feature vectors. The input to fit_transform should be a list of strings
train_data_features = vectorizer.fit_transform(clean_train_reviews)
np.asarray(train_data_features)
print(train_data_features.shape)

print("Training the random forest...")
forest = RandomForestClassifier(n_estimators = 500)
forest = forest.fit(train_data_features, train["sentiment"])

# Testing 
clean_test_reviews = []

print("Cleaning and parsing test set movie reviews...\n")
for i in xrange(0, len(test["review"])):
    clean_test_reviews.append(" ".join(KaggleWord2VecUtility.review_to_words(test["review"][i], True)))

test_data_features = vectorizer.transform(clean_test_reviews)
np.asarray(test_data_features)

result = forest.predict(test_data_features) 




# copy the results to a pandas dataframe with 
# an "id" column 
# sentiment column 
output = pd.DataFrame(data = {"id":test["id"], "sentiment":result})
output.to_csv("model.csv", index = False, quoting = 3)

# Function to convert a document into a sequence of words
# Optionally removing stop words
# Returns a list of words
def review_to_wordlist(review, remove_stopwords=False ):
    # Removes HTML
    review_text = BeautifulSoup(review).get_text()
    # Removes non-letters
    review_text = re.sub("[^a-zA-Z]","", review_text)
    # Converts words to lower case and splits them by whitecase to an array
    words = review_text.lower().split()
    # Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    # Returns the list of words
    return(words)

# Loads the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle');

# Defines a function to split a review into parsed sentences
# splits a reiew into parsed sentences. Returns a list of sentences, where each
# sentence is a list of words
def review_to_sentences(review, tokenizer, remove_stopwords=False ):
    raw_sentences = tokenizer.tokenize(review.strip()) # remove .decode() 
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, \
             remove_stopwords ))
    return sentences

sentences = [] # initialize an empty list of sentences for the review_to_sentences()

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print(len(sentences))
print(sentences[0])

# Training Model
# Some C and Java word2vec tools are known to truncate the strings at byte boundaries, 
# which can result in cutting a multi-byte utf8 character in half, making it non-valid utf8
# leading to this error -> Unicode codec can't decode bytes in position (x)

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s' , \
	level = logging.INFO);

num_features = 360 # word vector dimesionality
min_word_count = 40 # minimum word count
num_workers = 4 # number of threads to run in parallel
context = 10 # context window size
downsampling = 1e-3 # downsample setting for frequent words

print("Training model...")

""" model = Word2Vec(sentences, workers = num_workers, \
	size = num_features, min_count = min_word_count, \
	window = context, sample = downsampling, seed = 1)  """

embedding_dict = KeyedVectors.load_word2vec_format(sentences, binary = False, encoding='utf-8') #, unicode_errors='ignore')
embedding_dict = save_word2vec_format(sentences+".bin", binary = True)
embedding_dict = KeyedVectors.load_word2vec_format(sentences+".bin", binary = True)  

print("workkkk")
print("test 123" + '\n')
print(embedding_dict.most_similar_to_given('snake', ['pie', 'animal', 'vase', 'pizza']))

""" embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName, binary=False) 
embedding_dict.save_word2vec_format(dictFileName+".bin", binary=True) 
embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName+".bin", binary=True) """

What I have tried:

Compiling the code using Python's Tensorflow with various helper vectorization libraries yields this error:

ValueError: Number of labels=25000 does not match number of samples=25

I'm not sure how exactly to change the x or y fitting values?

Thanks in advance for the assistance.

Posted 9-Jan-19 21:41pm

Member 14113783

Updated 10-Jan-19 1:14am

#realJSOP

v3

Add a Solution

Comments

Richard MacCutchan 10-Jan-19 4:54am

Please edit your question, add proper <pre> tags around your code, and indicate exactly where the error occurs.

1 solution

Add a Solution

Add your solution here

Treat my content as plain text, not as HTML

Preview 0

…

Existing Members

Sign in to your account

...or Join us

Download, Vote, Comment, Publish.

Your Email
Password
Forgot your password?

Your Email
This email is in use. Do you need your password?
Optional Password

I have read and agree to the Terms of Service and Privacy Policy
Please subscribe me to the CodeProject newsletters

When answering a question please:

Read the question carefully.
Understand that English isn't everyone's first language so be lenient of bad spelling and grammar.
If a question is poorly phrased then either ask for clarification, ignore it, or edit the question and fix the problem. Insults are not welcome.
Don't tell someone to read the manual. Chances are they have and don't get it. Provide an answer or move on to the next question.

Let's work to help developers, not make them feel stupid.

This content, along with any associated source code and files, is licensed under The Code Project Open License (CPOL)

#realJSOP · Answer 1 · 2019-01-10T01:14:00

I suggest comment out as much as possible and compile to make sure a codeless app copiles correctly, and then start adding code back a little at a time, and compile after each part is added back. Do this until the copile fails, and you've at least narrowed down your problem and can concentrate on the code that caused thefailure.