ValueError for RandomForestClassifier, desperately need help
import os
import re                                    # remove punctuation and numbers
import logging
import codecs
import nltk.data
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup                # strips HTML markup
from nltk.corpus import stopwords            # word lists for string cleaning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from gensim.models.wrappers.fasttext import FastText
from gensim.models.keyedvectors import KeyedVectors
from KaggleWord2VecUtility import KaggleWord2VecUtility
# nltk.download()

# Read data from files
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

print("Read %d labeled train reviews, %d labeled test reviews, "
      "and %d unlabeled reviews\n" % (train["review"].size,
                                      test["review"].size,
                                      unlabeled_train["review"].size))


def review_to_words(raw_review):
    # Remove HTML
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # Convert to lower case and split into individual words
    words = letters_only.lower().split()
    # Convert the stop words to a set for fast lookup
    stops = set(stopwords.words("english"))
    # Remove the stop words
    meaningful_words = [w for w in words if w not in stops]
    # Join the words back into one space-separated string and return it
    return " ".join(meaningful_words)


clean_review = review_to_words(train["review"][0])
print(clean_review)

num_reviews = train["review"].size
clean_train_reviews = []
for i in range(0, num_reviews):
    # If the index is evenly divisible by 1000, print a progress message
    if (i + 1) % 1000 == 0:
        print("Review %d of %d\n" % (i + 1, num_reviews))
    clean_review = review_to_words(train["review"][i])
    clean_train_reviews.append(clean_review)

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

# fit_transform fits the model, learns the vocabulary, and transforms the
# training data into feature vectors; its input should be a list of strings.
train_data_features = vectorizer.fit_transform(clean_train_reviews)
np.asarray(train_data_features)  # note: the result of this call is discarded
print(train_data_features.shape)

print("Training the random forest...")
forest = RandomForestClassifier(n_estimators=500)
forest = forest.fit(train_data_features, train["sentiment"])

# Testing
clean_test_reviews = []
print("Cleaning and parsing test set movie reviews...\n")
for i in range(0, len(test["review"])):
    clean_test_reviews.append(
        " ".join(KaggleWord2VecUtility.review_to_words(test["review"][i], True)))

test_data_features = vectorizer.transform(clean_test_reviews)
np.asarray(test_data_features)
result = forest.predict(test_data_features)

# Copy the results to a pandas DataFrame with an "id" column and a
# "sentiment" column
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("model.csv", index=False, quoting=3)


# Convert a document into a sequence of words, optionally removing stop
# words. Returns a list of words.
def review_to_wordlist(review, remove_stopwords=False):
    # Remove HTML
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Convert words to lower case and split them on whitespace into a list
    words = review_text.lower().split()
    # Optionally remove stop words (False by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # Return the list of words
    return words


# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


# Split a review into parsed sentences. Returns a list of sentences, where
# each sentence is a list of words.
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())  # remove .decode()
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences


sentences = []  # initialize an empty list of sentences for review_to_sentences()

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print(len(sentences))
print(sentences[0])

# Training the model.
# Some C and Java word2vec tools are known to truncate strings at byte
# boundaries, which can cut a multi-byte UTF-8 character in half, making it
# invalid UTF-8 and leading to the error "codec can't decode bytes in
# position (x)".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

num_features = 360    # word vector dimensionality
min_word_count = 40   # minimum word count
num_workers = 4       # number of threads to run in parallel
context = 10          # context window size
downsampling = 1e-3   # downsample setting for frequent words

print("Training model...")
"""
model = Word2Vec(sentences, workers=num_workers,
                 size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling, seed=1)
"""
embedding_dict = KeyedVectors.load_word2vec_format(sentences, binary=False,
                                                   encoding='utf-8')  # , unicode_errors='ignore')
embedding_dict = save_word2vec_format(sentences + ".bin", binary=True)
embedding_dict = KeyedVectors.load_word2vec_format(sentences + ".bin",
                                                   binary=True)
print("workkkk")
print("test 123" + '\n')
print(embedding_dict.most_similar_to_given('snake', ['pie', 'animal', 'vase', 'pizza']))
"""
embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName, binary=False)
embedding_dict.save_word2vec_format(dictFileName+".bin", binary=True)
embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName+".bin", binary=True)
"""
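As written, the final embedding block cannot run: KeyedVectors.load_word2vec_format expects a path to a vectors file, but sentences is an in-memory list of token lists (so sentences + ".bin" also raises a TypeError), and save_word2vec_format is called as a bare function rather than as a method of a trained model. Below is a minimal sketch of what was presumably intended, reusing the variables defined above; the file name "imdb_w2v.bin" is a placeholder of mine, and the sketch assumes gensim 3.x, where Word2Vec still takes the size parameter:

# Train vectors on the tokenized sentences, then round-trip them through
# the binary word2vec format. "imdb_w2v.bin" is a hypothetical file name.
model = Word2Vec(sentences, workers=num_workers, size=num_features,
                 min_count=min_word_count, window=context,
                 sample=downsampling, seed=1)
model.wv.save_word2vec_format("imdb_w2v.bin", binary=True)   # write vectors to disk
embedding_dict = KeyedVectors.load_word2vec_format("imdb_w2v.bin", binary=True)
print(embedding_dict.most_similar_to_given('snake', ['pie', 'animal', 'vase', 'pizza']))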
What I have tried:
Compiling the code using TensorFlow Python with various helper vectorization libraries leads to this error:
ValueError: Number of labels=25000 does not match number of samples=25
I don't know how exactly I should change the x or y values passed to fit.
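A quick way to locate the mismatch: scikit-learn raises this ValueError when the number of rows in X differs from the length of y, and the message says X ended up with only 25 rows against 25000 labels, which would suggest clean_train_reviews did not hold all 25000 cleaned reviews when fit_transform ran. A minimal check, reusing the names from the code above:

# Both shapes must agree on the first axis: one row/label per review.
print(len(clean_train_reviews))     # expect 25000
print(train_data_features.shape)    # expect (25000, 5000)
print(train["sentiment"].shape)     # expect (25000,)
forest.fit(train_data_features, train["sentiment"])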
Thanks in advance for any help.
Richard MacCutchan
Please edit your question, add the appropriate <pre> tags around your code, and indicate exactly where the error occurs.