Member 14113783 Answers: 1

ValueError for RandomForestClassifier - desperate for help


import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from KaggleWord2VecUtility import KaggleWord2VecUtility
import pandas as pd
import numpy as np
import logging

from nltk.corpus import stopwords # imports various modules for string cleaning
import re # remove punctuations and numbers
import nltk.data

from bs4 import BeautifulSoup # removes stopwords

from gensim.models import Word2Vec

from gensim.models.wrappers.fasttext import FastText
from gensim.models.keyedvectors import KeyedVectors

import codecs

# nltk.download()

# Read data from files
train = pd.read_csv( "labeledTrainData.tsv", header=0,
                    delimiter="\t", quoting=3 )
test = pd.read_csv( "testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv", header=0,
                              delimiter="\t", quoting=3 )

print("Read %d labeled train reviews, %d labeled test reviews, " \
"and %d unlabled reviews \n" %
(train["review"].size,
 test["review"].size,
 unlabeled_train["review"].size))

def review_to_words(raw_review):
    # remove HTML (an explicit parser avoids BeautifulSoup's parser warning)
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # convert to lower case and split into individual words
    words = letters_only.lower().split()
    # convert the stop words to a set for fast lookup
    stops = set(stopwords.words("english"))
    # remove the stop words
    meaningful_words = [w for w in words if w not in stops]
    # join the words back into one string separated by spaces and return it
    return " ".join(meaningful_words)

clean_review = review_to_words(train["review"][0])
print(clean_review)

num_reviews = train["review"].size
clean_train_reviews = []


for i in range(0, num_reviews):
    # if the index is evenly divisible by 1000, print a progress message
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i + 1, num_reviews))
    # clean every review, not just every 1000th one: with these two lines
    # indented under the if above, only 25 of the 25000 reviews are kept,
    # which is exactly the labels=25000 vs samples=25 mismatch reported below
    clean_review = review_to_words(train["review"][i])
    clean_train_reviews.append(clean_review)



vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)


# fit_transform: fits the model and learns the vocab; then transforms our training data
# into feature vectors. The input to fit_transform should be a list of strings
train_data_features = vectorizer.fit_transform(clean_train_reviews)
# keep the result: the original bare np.asarray(...) call discarded its
# return value and left the sparse matrix unchanged
train_data_features = train_data_features.toarray()
print(train_data_features.shape)

print("Training the random forest...")
forest = RandomForestClassifier(n_estimators = 500)
forest = forest.fit(train_data_features, train["sentiment"])

# Testing 
clean_test_reviews = []

print("Cleaning and parsing test set movie reviews...\n")
for i in range(0, len(test["review"])):  # xrange is Python 2 only
    # reuse the same cleaning function as for the training data
    clean_test_reviews.append(review_to_words(test["review"][i]))

test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()  # keep the dense array

result = forest.predict(test_data_features) 




# copy the results to a pandas dataframe with 
# an "id" column 
# sentiment column 
output = pd.DataFrame(data = {"id":test["id"], "sentiment":result})
output.to_csv("model.csv", index = False, quoting = 3)

# Function to convert a document into a sequence of words
# Optionally removing stop words
# Returns a list of words
def review_to_wordlist(review, remove_stopwords=False):
    # Removes HTML
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Removes non-letters (replace with a space, not an empty string,
    # so adjacent words are not glued together)
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Converts words to lower case and splits them on whitespace into a list
    words = review_text.lower().split()
    # Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # Returns the list of words
    return words

# Loads the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Splits a review into parsed sentences. Returns a list of sentences,
# where each sentence is a list of words
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # .decode() from the Python 2 version removed; str is already unicode in Python 3
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences

sentences = [] # initialize an empty list of sentences for the review_to_sentences()

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print(len(sentences))
print(sentences[0])

# Training Model
# Some C and Java word2vec tools are known to truncate strings at byte boundaries,
# which can cut a multi-byte UTF-8 character in half, leaving the file invalid UTF-8
# and producing errors like: UnicodeDecodeError: 'utf-8' codec can't decode byte ... in position (x)
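# A minimal sketch (not part of the original script) of how such a file can still be
# loaded: load_word2vec_format accepts a unicode_errors argument, so invalid bytes
# can be replaced instead of raising; "vectors.txt" is a hypothetical file path.
# embedding = KeyedVectors.load_word2vec_format("vectors.txt", binary=False,
#                                               encoding='utf-8', unicode_errors='replace')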

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

num_features = 360 # word vector dimensionality
min_word_count = 40 # minimum word count
num_workers = 4 # number of threads to run in parallel
context = 10 # context window size
downsampling = 1e-3 # downsample setting for frequent words

print("Training model...")

""" model = Word2Vec(sentences, workers = num_workers, \
	size = num_features, min_count = min_word_count, \
	window = context, sample = downsampling, seed = 1)  """

embedding_dict = KeyedVectors.load_word2vec_format(sentences, binary = False, encoding='utf-8') #, unicode_errors='ignore')
embedding_dict = save_word2vec_format(sentences+".bin", binary = True)
embedding_dict = KeyedVectors.load_word2vec_format(sentences+".bin", binary = True)  

print("workkkk")
print("test 123" + '\n')
print(embedding_dict.most_similar_to_given('snake', ['pie', 'animal', 'vase', 'pizza']))

""" embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName, binary=False) 
embedding_dict.save_word2vec_format(dictFileName+".bin", binary=True) 
embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName+".bin", binary=True) """


What I have tried:

Running the code with TensorFlow Python and various helper vectorization libraries produces this error:
ValueError: Number of labels=25000 does not match number of samples=25
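
The mismatch comes from forest.fit(): scikit-learn requires the feature matrix and the label vector to have the same number of rows. A sketch of the check that fails, using the variable names from the script above:

# forest.fit(X, y) raises this ValueError when X and y disagree on row count:
# train_data_features ended up with 25 rows (one per 1000th review) while
# train["sentiment"] has 25000 entries
assert train_data_features.shape[0] == train["sentiment"].size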


I don't know exactly how I should change the x or y values passed to fit.

Thanks in advance for any help.

Richard MacCutchan

Please edit your question, add proper <pre> tags around your code, and indicate exactly where the error occurs.

1 Answer

Rating:
1

#realJSOP

I suggest commenting out as much as possible and compiling to make sure the nearly code-free application runs correctly, then adding the code back a little at a time, running it again after each part is restored. Do this until the run fails; at that point you will at least have narrowed down your problem and can focus on the code that caused the failure.
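
Applied to the script above, the same idea would be to run just the data-preparation stage and inspect it before ever calling fit (a sketch using the names from the question):

print("cleaned reviews:", len(clean_train_reviews))  # should be 25000
print("labels:", train["sentiment"].size)            # 25000
# with the cleaning calls indented under the progress-message if, the first
# print shows 25, which pinpoints the code that causes the ValueError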