# Importacion necesaria
from sklearn.linear_model import LogisticRegression

# Create a logistic-regression classifier object and fit it on the
# previously defined variables X and y.
log_reg = LogisticRegression()
log_reg.fit(X, y)

# Importacion necesaria
from sklearn.model_selection import train_test_split

# train_test_split accepts arrays, lists or DataFrames. For every input it
# returns the train part followed by the test part, so for (X, y) the result
# is (X_train, X_test, y_train, y_test) -- the unpacking must follow that
# order or the pieces end up under the wrong names.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,       # hold out 20% of the samples for testing
    random_state=123,    # fixed seed so the split is reproducible
    stratify=y,          # stratify the split on the labels in y
)

# Required import (confusion_matrix lives in sklearn.metrics)
from sklearn.metrics import confusion_matrix

# Fit the logistic regression on the training split and predict the labels
# of the test split. The predictions must come from X_test so that they line
# up, row by row, with y_test below.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Confusion matrix, divided by the test-set size so every cell is a fraction
# of the test samples
print(confusion_matrix(y_test, y_pred) / len(y_test))

### Ejemplo de salida
[[0.3782 0.1222]
 [0.1351 0.3635]]


import pandas as pd
# Read the review sample and keep only the 'score' and 'review' columns
data = pd.read_csv('amazon_reviews_sample.csv').loc[:, ['score', 'review']]
data.head()


data


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   score   10000 non-null  int64 
 1   review  10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


# One-column frame holding the text of the reviews whose score equals 1
data_positive = pd.DataFrame(data[data.score == 1].review)
data_positive


# Look at the first review. A single positional (row, column) lookup is used
# instead of the chained .iloc[0][0]: in the chained form the second [0] is an
# integer key on a Series labelled by column name, which pandas has deprecated.
data_positive.iloc[0, 0]

' Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\r\n'


import re

# Search pattern. It matches any of:
#   - the literal HTML break pair '<br /><br />',
#   - any single non-word character ([^\w]),
#   - an underscore.
pattern = re.compile(r'<br /><br />|[^\w]|_')

# Replace every match (not only the <br /> pairs) with one blank space
data_positive.review = data_positive.review.apply(lambda x: pattern.sub(' ', x))

# Look at the first review again. A single positional (row, column) lookup
# avoids the deprecated integer-key access of the chained .iloc[0][0].
data_positive.iloc[0, 0]

' Stuning even for the non gamer  This sound track was beautiful  It paints the senery in your mind so well I would recomend it even to people who hate vid  game music  I have played the game Chrono Cross but out of all of the games I have ever played it has the best music  It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras  It would impress anyone who cares to listen       '


# Second review, first column, read with one positional (row, column) lookup
# rather than the chained .iloc[1][0] (deprecated integer key on a Series).
data_positive.iloc[1, 0]

' The best soundtrack ever to anything   I m reading a lot of reviews saying that this is the best  game soundtrack  and I figured that I d write a review to disagree a bit  This in my opinino is Yasunori Mitsuda s ultimate masterpiece  The music is timeless and I m been listening to it for years now and its beauty simply refuses to fade The price tag on this is pretty staggering I must say  but if you are going to buy any cd for this much money  this is the only one that I feel would be worth every penny   '


# Concatenate the first column of every row into one long string. The result
# starts with a single blank and the pieces are glued together with no extra
# separator. str.join builds the string in a single pass instead of growing
# it with += and doing a positional row lookup on every iteration.
positive_reviews = ' ' + ''.join(data_positive.iloc[:, 0])
positive_reviews[:1000]

'  Stuning even for the non gamer  This sound track was beautiful  It paints the senery in your mind so well I would recomend it even to people who hate vid  game music  I have played the game Chrono Cross but out of all of the games I have ever played it has the best music  It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras  It would impress anyone who cares to listen        The best soundtrack ever to anything   I m reading a lot of reviews saying that this is the best  game soundtrack  and I figured that I d write a review to disagree a bit  This in my opinino is Yasunori Mitsuda s ultimate masterpiece  The music is timeless and I m been listening to it for years now and its beauty simply refuses to fade The price tag on this is pretty staggering I must say  but if you are going to buy any cd for this much money  this is the only one that I feel would be worth every penny    Amazing   This soundtrack is my favorite music of all tim'


# Number of characters in the concatenated string
len(positive_reviews)

2112327


# Wherever a blank appears one or more times in a row ...
pattern = re.compile(r' +')
# ... leave exactly one blank in its place
positive_reviews2 = re.sub(pattern, ' ', positive_reviews)
positive_reviews2[:1000]

' Stuning even for the non gamer This sound track was beautiful It paints the senery in your mind so well I would recomend it even to people who hate vid game music I have played the game Chrono Cross but out of all of the games I have ever played it has the best music It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras It would impress anyone who cares to listen The best soundtrack ever to anything I m reading a lot of reviews saying that this is the best game soundtrack and I figured that I d write a review to disagree a bit This in my opinino is Yasunori Mitsuda s ultimate masterpiece The music is timeless and I m been listening to it for years now and its beauty simply refuses to fade The price tag on this is pretty staggering I must say but if you are going to buy any cd for this much money this is the only one that I feel would be worth every penny Amazing This soundtrack is my favorite music of all time hands down The intense sa'


# Number of characters left after collapsing repeated blanks
len(positive_reviews2)

2044858


# Importacion necesaria
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Open the figure the cloud will be drawn on
plt.figure(figsize=(10, 6))

# Create a WordCloud object with a white background and generate the cloud
# from the cleaned positive reviews
cloud_positive = WordCloud(background_color='white')
cloud_positive.generate(positive_reviews2)

# Draw the cloud, hide the axes and show the figure
plt.imshow(cloud_positive, interpolation='bilinear')
plt.axis('off')
plt.show()


# Also import sklearn's predefined list of English stop words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Extend the predefined stop words with a few extra words. Every entry is
# written lowercase and without surrounding whitespace: an entry such as
# ' people' (leading blank) or upper-case 'CD' / 'DVD' cannot match a
# lowercased token, so it would silently filter nothing.
stop_words = ENGLISH_STOP_WORDS.union([
    'book', 'books', 'song', 'people', 'product',
    'video', 'movie', 'read', 'music',
    'cd', 'film', 'dvd',
    'album', 'day', 'time',
])

# Create a WordCloud object with a white background that ignores the stop
# words, and generate the cloud from the cleaned positive reviews
cloud_positive = WordCloud(background_color='white',
                           stopwords=stop_words).generate(positive_reviews2)

# Plot
plt.figure(figsize=(10, 6))
plt.imshow(cloud_positive, interpolation='bilinear')
# Hide the axes
plt.axis('off')
plt.show()


data_positive.head()


# Required import: take word_tokenize from its public home, nltk.tokenize,
# rather than from nltk.corpus.reader.tagged, which is a corpus-reader module
# and not the tokenizer's documented location.
from nltk.tokenize import word_tokenize
# Example
word_tokenize('hello how are you'.lower())

['hello', 'how', 'are', 'you']


# Tokenization: lowercase each review, then split it into word tokens
data_positive['tokens'] = data_positive.review.str.lower().apply(word_tokenize)
data_positive.head()


# Importacion necesaria
from nltk.stem import WordNetLemmatizer

# Instantiate the lemmatizer. The variable is spelled 'lemattizer'; the
# lematizar() helper looks it up under exactly this name.
lemattizer = WordNetLemmatizer()

# Helper that lemmatizes a whole sequence of tokens
def lematizar(tokens):
    """Return a new list with the lemma of every token, in the same order.

    Each token is passed to ``lemmatize`` of the module-level ``lemattizer``
    object; an empty input yields an empty list.
    """
    return [lemattizer.lemmatize(palabra) for palabra in tokens]

# New column holding the lemmatized version of each token list
data_positive['tokens_lemm'] = data_positive['tokens'].apply(lematizar)
data_positive.head()


# New column with the length of each token list
data_positive['long'] = data_positive['tokens'].apply(len)
data_positive.head()


# Reviews sorted by token count, in ascending order
data_positive.sort_values('long')[['review', 'long']]


import warnings
# NOTE(review): this silences every warning from here on, including library
# deprecation notices -- consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer

# Build the vectorizer with the stop-word set defined earlier (passed as a
# list, the collection type the parameter is documented to take), using
# unigrams and bigrams, and keeping at most 200 features (max_features caps
# the vocabulary size; it is not a frequency threshold). It is fitted on all
# the reviews.
# NOTE(review): fitting on data.review uses every review, i.e. before any
# train/test split of the resulting features.
vect = TfidfVectorizer(stop_words=list(stop_words), ngram_range=(1, 2),
                       max_features=200).fit(data.review)

# Build the sparse matrix
X = vect.transform(data.review)

# Dense DataFrame with one column per feature. get_feature_names_out() is
# the current accessor; the older get_feature_names() has been removed from
# scikit-learn.
reviews_transformed = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
reviews_transformed.head()


# (rows, columns) of the feature frame
reviews_transformed.shape

(10000, 200)


# (rows, columns) of the original data, to compare row counts
data.shape

(10000, 2)


# The 'score' column of the original data
data.score

0       1
1       1
2       1
3       1
4       1
       ..
9995    1
9996    1
9997    0
9998    0
9999    1
Name: score, Length: 10000, dtype: int64


# Attach the score column to the feature frame
reviews_transformed['score'] = data.score
reviews_transformed.head()


# Required imports
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Define the variables. to_numpy() is used so the cell does not depend on a
# numpy alias that is not imported here, and y is kept one-dimensional, the
# shape scikit-learn estimators expect for a label vector.
y = reviews_transformed['score'].to_numpy()
X = reviews_transformed.drop('score', axis=1).to_numpy()

# Split the data: 20% held out for testing, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=123)
# Train a logistic-regression model
log_reg = LogisticRegression().fit(X_train, y_train)

# Predict the labels of the test split
y_pred = log_reg.predict(X_test)

# Print the accuracy score and the confusion matrix (as fractions of the
# test set)
print('Precisión del modelo en el conjunto de prueba: ', accuracy_score(y_test, y_pred))
print()
print(confusion_matrix(y_test, y_pred) / len(y_test))

Precisión del modelo en el conjunto de prueba:  0.794

[[0.414  0.1005]
 [0.1055 0.38  ]]


# New review
res = 'The book of Don Quixote is a spectacular, beautiful and sublime work of Cervantes'

# Build the sparse matrix for the new review, reusing the vocabulary and
# weights already stored in the fitted vectorizer
X = vect.transform([res])

# DataFrame view of that matrix. get_feature_names_out() is the current
# accessor; the older get_feature_names() has been removed from scikit-learn.
reviews_transformed_new = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
reviews_transformed_new.head()


# Predict the sentiment. The values are passed as a plain array because the
# model above was fitted on arrays without column names.
log_reg.predict(reviews_transformed_new.to_numpy())

array([1], dtype=int64)

.	Valor real = 1	Valor real = 0
Predicción = 1	Verdadero positivo	Falso positivo
Predicción = 0	Falso negativo	Verdadero negativo

	score	review
0	1	Stuning even for the non-gamer: This sound tr...
1	1	The best soundtrack ever to anything.: I'm re...
2	1	Amazing!: This soundtrack is my favorite musi...
3	1	Excellent Soundtrack: I truly like this sound...
4	1	Remember, Pull Your Jaw Off The Floor After H...

	score	review
0	1	Stuning even for the non-gamer: This sound tr...
1	1	The best soundtrack ever to anything.: I'm re...
2	1	Amazing!: This soundtrack is my favorite musi...
3	1	Excellent Soundtrack: I truly like this sound...
4	1	Remember, Pull Your Jaw Off The Floor After H...
...	...	...
9995	1	A revelation of life in small town America in...
9996	1	Great biography of a very interesting journal...
9997	0	Interesting Subject; Poor Presentation: You'd...
9998	0	Don't buy: The box looked used and it is obvi...
9999	1	Beautiful Pen and Fast Delivery.: The pen was...

	review
0	Stuning even for the non-gamer: This sound tr...
1	The best soundtrack ever to anything.: I'm re...
2	Amazing!: This soundtrack is my favorite musi...
3	Excellent Soundtrack: I truly like this sound...
4	Remember, Pull Your Jaw Off The Floor After H...
...	...
9990	Psychological thriller!: This movie really sc...
9991	A little more money than what I expected to s...
9995	A revelation of life in small town America in...
9996	Great biography of a very interesting journal...
9999	Beautiful Pen and Fast Delivery.: The pen was...

	review
0	Stuning even for the non gamer This sound tr...
1	The best soundtrack ever to anything I m re...
2	Amazing This soundtrack is my favorite musi...
3	Excellent Soundtrack I truly like this sound...
4	Remember Pull Your Jaw Off The Floor After H...

Procesamiento de lenguaje natural

Análisis de sentimientos

Proyecto

Desglose de temas

Proyecto 0

Recapitulación

	review	tokens	tokens_lemm
0	Stuning even for the non gamer This sound tr...	[stuning, even, for, the, non, gamer, this, so...	[Stuning, even, for, the, non, gamer, This, so...
1	The best soundtrack ever to anything I m re...	[the, best, soundtrack, ever, to, anything, i,...	[The, best, soundtrack, ever, to, anything, I,...
2	Amazing This soundtrack is my favorite musi...	[amazing, this, soundtrack, is, my, favorite, ...	[Amazing, This, soundtrack, is, my, favorite, ...
3	Excellent Soundtrack I truly like this sound...	[excellent, soundtrack, i, truly, like, this, ...	[Excellent, Soundtrack, I, truly, like, this, ...
4	Remember Pull Your Jaw Off The Floor After H...	[remember, pull, your, jaw, off, the, floor, a...	[Remember, Pull, Your, Jaw, Off, The, Floor, A...

	review	long
881	Good read ...	4
8887	First Knight Cassette Excellent service p...	15
7654	Question of thought How would Nietzsche crit...	15
7932	excellent Original fun breathless ...	15
7951	great subsitute for sony memory stick Great ...	16
...	...	...
2271	It ll get in your head I saw the video f...	203
6576	I love it Ray J has soul I love this cd so ...	205
303	Fun To Listen To This is the second best Ca...	208
4313	Has Streisand stayed too long at the fair NO...	208
8257	cabin worthy Live in New Mexico and have a s...	215

	amazing	away	...	work	worth	years
0	0.000000	0.296352	...	0.000000	0.000000	0.000000
1	0.000000	0.000000	...	0.000000	0.224690	0.213913
2	0.432583	0.000000	...	0.161535	0.180921	0.172243
3	0.000000	0.000000	...	0.000000	0.000000	0.000000
4	0.000000	0.000000	...	0.000000	0.000000	0.000000