import pandas as pd

# Load the review sample and keep only the 'score' and 'review' columns.
data = pd.read_csv('amazon_reviews_sample.csv').loc[:, ['score', 'review']]
data.head()


# Required import
from sklearn.feature_extraction.text import CountVectorizer

# Build a CountVectorizer. max_features=1000 restricts the vocabulary to the
# 1000 terms with the highest frequency across the review corpus.
vect = CountVectorizer(max_features=1000)

# Learn the vocabulary from the reviews and build the bag-of-words matrix.
# fit_transform() is equivalent to fit() followed by transform(), but it
# avoids tokenizing the whole corpus twice.
X = vect.fit_transform(data.review)

X

<10000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 406668 stored elements in Compressed Sparse Row format>


# Convert the sparse bag-of-words matrix into a dense NumPy array
# (one row per review, one column per vocabulary term).
array_matrix = X.toarray()
array_matrix

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)


# Wrap the dense counts in a DataFrame whose columns are the vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
X_df = pd.DataFrame(array_matrix, columns=vect.get_feature_names_out())
X_df

C:\ProgramData\Miniconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)


# Consider both unigrams and bigrams, still capped at the 1000 most
# frequent terms.
vect = CountVectorizer(max_features=1000, ngram_range=(1, 2))

# Learn the vocabulary from the reviews and build the bag-of-words matrix.
# fit_transform() is equivalent to fit() followed by transform(), but it
# avoids tokenizing the whole corpus twice.
X = vect.fit_transform(data.review)

# Build a DataFrame with one column per n-gram.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
X_df

C:\ProgramData\Miniconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)


# Look at the dataframe again
data


# Look at one particular review (the one labelled 0)
data.review[0]

' Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\r\n'


# Light clean-up: keep only word characters (letters, digits, underscore).
import re

# \W matches any single character that is not a word character.
# The previous pattern r'[^\w+]' had a stray '+' inside the character class,
# so a literal '+' was excluded from the match and survived the clean-up.
pattern = re.compile(r'\W')

# Replace every non-word character with a blank space (one space per
# character, so runs of punctuation become runs of spaces).
match = pattern.sub(' ', data.review[0])

match

' Stuning even for the non gamer  This sound track was beautiful  It paints the senery in your mind so well I would recomend it even to people who hate vid  game music  I have played the game Chrono Cross but out of all of the games I have ever played it has the best music  It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras  It would impress anyone who cares to listen   _   '


from nltk import word_tokenize

# Split the cleaned review text into a list of tokens
match_tokens = word_tokenize(match) 
match_tokens

['Stuning',
 'even',
 'for',
 'the',
 'non',
 'gamer',
 'This',
 'sound',
 'track',
 'was',
 'beautiful',
 'It',
 'paints',
 'the',
 'senery',
 'in',
 'your',
 'mind',
 'so',
 'well',
 'I',
 'would',
 'recomend',
 'it',
 'even',
 'to',
 'people',
 'who',
 'hate',
 'vid',
 'game',
 'music',
 'I',
 'have',
 'played',
 'the',
 'game',
 'Chrono',
 'Cross',
 'but',
 'out',
 'of',
 'all',
 'of',
 'the',
 'games',
 'I',
 'have',
 'ever',
 'played',
 'it',
 'has',
 'the',
 'best',
 'music',
 'It',
 'backs',
 'away',
 'from',
 'crude',
 'keyboarding',
 'and',
 'takes',
 'a',
 'fresher',
 'step',
 'with',
 'grate',
 'guitars',
 'and',
 'soulful',
 'orchestras',
 'It',
 'would',
 'impress',
 'anyone',
 'who',
 'cares',
 'to',
 'listen',
 '_']


# \W matches any single character that is not a letter, digit or underscore.
# The previous pattern r'[^\w+]' had a stray '+' inside the character class,
# so a literal '+' was excluded from the match and survived the clean-up.
pattern = re.compile(r'\W')

# Clean each review (non-word characters become blank spaces) and tokenize
# the cleaned text, storing the resulting token list in a new column.
data['tokens'] = data.review.apply(
    lambda text: word_tokenize(pattern.sub(' ', text))
)

data


# Token list of the review labelled 0
data.tokens[0]

['Stuning',
 'even',
 'for',
 'the',
 'non',
 'gamer',
 'This',
 'sound',
 'track',
 'was',
 'beautiful',
 'It',
 'paints',
 'the',
 'senery',
 'in',
 'your',
 'mind',
 'so',
 'well',
 'I',
 'would',
 'recomend',
 'it',
 'even',
 'to',
 'people',
 'who',
 'hate',
 'vid',
 'game',
 'music',
 'I',
 'have',
 'played',
 'the',
 'game',
 'Chrono',
 'Cross',
 'but',
 'out',
 'of',
 'all',
 'of',
 'the',
 'games',
 'I',
 'have',
 'ever',
 'played',
 'it',
 'has',
 'the',
 'best',
 'music',
 'It',
 'backs',
 'away',
 'from',
 'crude',
 'keyboarding',
 'and',
 'takes',
 'a',
 'fresher',
 'step',
 'with',
 'grate',
 'guitars',
 'and',
 'soulful',
 'orchestras',
 'It',
 'would',
 'impress',
 'anyone',
 'who',
 'cares',
 'to',
 'listen',
 '_']


# Additional feature: number of tokens in each review.
data['long'] = data.tokens.apply(len)
data


# Required import.
# detect_langs helps us detect the language of a text string
from langdetect import detect_langs

# String written in Spanish
cad = 'Hola mundo, estoy escribiendo en español.'

# Apply detect_langs to the string above
detect_langs(cad)

[es:0.999996656110993]


# String written in English
cad = 'Hello world, I am writing in Spanish.'

# Apply detect_langs to the string above
detect_langs(cad)

[en:0.9999958712167164]


# langdetect is non-deterministic by default, so repeated runs can label
# short or noisy texts differently. Fixing the seed makes the detected
# languages (and any per-language counts derived from them) reproducible.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

# New column holding, for each review, the list of candidate languages
# returned by detect_langs.
data['lang'] = data.review.apply(detect_langs)
data


# Each entry of the 'lang' column is a list of detected candidates whose
# first element is the most probable language. Its string form looks like
# 'es:0.99...', so the first two characters are the language code.
data['lang_exp'] = [str(candidates[0])[:2] for candidates in data.lang]
data


# There should be one review detected as Italian
data.loc[data['lang_exp'] == 'it']


# There should be 8 reviews detected as French
data.loc[data['lang_exp'] == 'fr']


# There should be 14 reviews detected as Spanish
data.loc[data['lang_exp'] == 'es']


# Inspect the full text of the review at position 6234
# (one of the reviews detected as Spanish).
data.review.iloc[6234]

' 5+ stars. LO MEJOR DE LO QUE HE LEIDO EN MI VIDA.: Un resúmen:El clan del oso cavernario: EXCELENTE, INSPIRADOR.Un libro para leer y releer, es como un libro de superación personal, pero sin los aburridos consejos sabiondos de los autores, ni las falsas promesas de los nuevos autores llamados "new age". DEBE LEERLO, Y POR FAVOR, NO LO PRESTE.....!El valle de los caballos: BUENA SECUELA, RECOMENDABLE.Los cazadores del mamut: SOLO PARA FANS.El libro de los viajes ("Plains"): REPETITIVO, LENTO Y ABURRIDO EN DOS TERCERAS PARTES (LA\'ÚLTIMA PARTE, COMO EN "cazadores", PARA FANS).....LA ESPERANZA...."The Shelter of stone"...donde los que admiramos a Ayla, esperamos reencontrarnos con ella...\r\n'


# Required import
from textblob import TextBlob

# Instantiate a TextBlob on the review text.
# NOTE(review): this review is written in Spanish; TextBlob's default
# sentiment analyzer presumably targets English text — confirm that the
# resulting score is meaningful here.
valence = TextBlob(data.review.iloc[6234])

# The valence score is exposed through the sentiment attribute
valence.sentiment

Sentiment(polarity=0.06818181818181818, subjectivity=0.37727272727272726)

	score	review
0	1	Stuning even for the non-gamer: This sound tr...
1	1	The best soundtrack ever to anything.: I'm re...
2	1	Amazing!: This soundtrack is my favorite musi...
3	1	Excellent Soundtrack: I truly like this sound...
4	1	Remember, Pull Your Jaw Off The Floor After H...

	score	review
0	1	Stuning even for the non-gamer: This sound tr...
1	1	The best soundtrack ever to anything.: I'm re...
2	1	Amazing!: This soundtrack is my favorite musi...
3	1	Excellent Soundtrack: I truly like this sound...
4	1	Remember, Pull Your Jaw Off The Floor After H...
...	...	...
9995	1	A revelation of life in small town America in...
9996	1	Great biography of a very interesting journal...
9997	0	Interesting Subject; Poor Presentation: You'd...
9998	0	Don't buy: The box looked used and it is obvi...
9999	1	Beautiful Pen and Fast Delivery.: The pen was...

	score	review	tokens
0	1	Stuning even for the non-gamer: This sound tr...	[Stuning, even, for, the, non, gamer, This, so...
1	1	The best soundtrack ever to anything.: I'm re...	[The, best, soundtrack, ever, to, anything, I,...
2	1	Amazing!: This soundtrack is my favorite musi...	[Amazing, This, soundtrack, is, my, favorite, ...
3	1	Excellent Soundtrack: I truly like this sound...	[Excellent, Soundtrack, I, truly, like, this, ...
4	1	Remember, Pull Your Jaw Off The Floor After H...	[Remember, Pull, Your, Jaw, Off, The, Floor, A...
...	...	...	...
9995	1	A revelation of life in small town America in...	[A, revelation, of, life, in, small, town, Ame...
9996	1	Great biography of a very interesting journal...	[Great, biography, of, a, very, interesting, j...
9997	0	Interesting Subject; Poor Presentation: You'd...	[Interesting, Subject, Poor, Presentation, You...
9998	0	Don't buy: The box looked used and it is obvi...	[Don, t, buy, The, box, looked, used, and, it,...
9999	1	Beautiful Pen and Fast Delivery.: The pen was...	[Beautiful, Pen, and, Fast, Delivery, The, pen...

	score	review	tokens	long
0	1	Stuning even for the non-gamer: This sound tr...	[Stuning, even, for, the, non, gamer, This, so...	81
1	1	The best soundtrack ever to anything.: I'm re...	[The, best, soundtrack, ever, to, anything, I,...	102
2	1	Amazing!: This soundtrack is my favorite musi...	[Amazing, This, soundtrack, is, my, favorite, ...	136
3	1	Excellent Soundtrack: I truly like this sound...	[Excellent, Soundtrack, I, truly, like, this, ...	122
4	1	Remember, Pull Your Jaw Off The Floor After H...	[Remember, Pull, Your, Jaw, Off, The, Floor, A...	90
...	...	...	...	...
9995	1	A revelation of life in small town America in...	[A, revelation, of, life, in, small, town, Ame...	157
9996	1	Great biography of a very interesting journal...	[Great, biography, of, a, very, interesting, j...	143
9997	0	Interesting Subject; Poor Presentation: You'd...	[Interesting, Subject, Poor, Presentation, You...	112
9998	0	Don't buy: The box looked used and it is obvi...	[Don, t, buy, The, box, looked, used, and, it,...	29
9999	1	Beautiful Pen and Fast Delivery.: The pen was...	[Beautiful, Pen, and, Fast, Delivery, The, pen...	105

	score	review	lang
0	1	Stuning even for the non-gamer: This sound tr...	[en:0.9999970045393871]
1	1	The best soundtrack ever to anything.: I'm re...	[en:0.9999969331701792]
2	1	Amazing!: This soundtrack is my favorite musi...	[en:0.999996523966582]
3	1	Excellent Soundtrack: I truly like this sound...	[en:0.9999977616627801]
4	1	Remember, Pull Your Jaw Off The Floor After H...	[en:0.9999966030055435]
...	...	...	...
9995	1	A revelation of life in small town America in...	[en:0.9999975756246933]
9996	1	Great biography of a very interesting journal...	[en:0.9999950445580532]
9997	0	Interesting Subject; Poor Presentation: You'd...	[en:0.999998331084728]
9998	0	Don't buy: The box looked used and it is obvi...	[en:0.999998492368484]
9999	1	Beautiful Pen and Fast Delivery.: The pen was...	[en:0.9999975843869945]

Procesamiento de lenguaje natural

Análisis de sentimientos

Bolsa de Palabras

n-gramas

Construcción de características adicionales

Detección del lenguaje

	score	review	lang	lang_exp
2316	1	Nightwish is unique and rocks for eva: Moi to...	[fr:0.9999970676897462]	fr
2760	0	Completement nul: Fait sur commande et ennuya...	[fr:0.9999972821811058]	fr
4914	0	De la poudre aux yeux: J'ai acheté un Sansa V...	[fr:0.9999964131374507]	fr
5720	1	C'est magnifique! il y a du vrai dans ce qui'...	[fr:0.9999973622432383]	fr
5875	1	Erreur: "Les Triplettes de Belleville" n'a pa...	[fr:0.9999954875413937]	fr
6631	0	certains bugs viennent tout gacher: le jeu es...	[fr:0.9999986981092219]	fr
7983	1	1F4T: Cet album est chanté vraiment bien. Jea...	[fr:0.9999971781286289]	fr
8340	1	Jean de Florette et Manon des sources: bien a...	[fr:0.9999981019283888]	fr

	score	review	lang	lang_exp
1259	1	La reencarnación vista por un científico: El ...	[es:0.9999953923802931]	es
1261	1	Magnifico libro: Brian Weiss ha dejado una ma...	[es:0.9999942254581136]	es
1745	1	Excelente!: Una excelente guía para todos aqu...	[es:0.9999977246887979]	es
2486	1	Palabras de aliento para tu caminar con Dios:...	[es:0.9999948990649543]	es
2903	1	fabuloso: mil gracias por el producto fabulos...	[es:0.9999968848809118]	es
3318	1	Excelentes botas.. excelentes boots: Excelent...	[es:0.9999957667748436]	es
3694	0	Why not Spanish ???: Alguien me puede decir p...	[es:0.9999956602589468]	es
4820	1	La mejor película de Moore: A mi juicio, esta...	[es:0.9999950238615187]	es
5901	1	Buen cargador: Product very good, I am of Ven...	[es:0.8571387835958204, en:0.1428601546048478]	es
6234	1	5+ stars. LO MEJOR DE LO QUE HE LEIDO EN MI V...	[es:0.9999942069437358]	es
7078	1	Variedad: Bueno tener este album debido a su ...	[es:0.9999959479067406]	es
8018	1	Exelente eleccion: Los mejores zapatos de fut...	[es:0.9999961554846727]	es
9265	1	Excelente: Manu es una de los mejores cantant...	[es:0.9999956344575924]	es
9624	0	baaaaaadddddddd bookkkkkkk: por favor no gast...	[es:0.9999957024195795]	es