# Importaciones necesarias
import pandas as pd
import matplotlib.pyplot as plt
# Importamos las palabras vacias (del ingles) que trae
# la libreria wordcloud
from wordcloud import WordCloud, STOPWORDS

# Load the IMDB sample, keeping only the review text and its label
# (in that column order).
data = pd.read_csv('IMDB_sample.csv')[['review', 'label']]


# Start from wordcloud's built-in English stop words and add a few
# domain-specific terms that say nothing about sentiment in movie reviews
# ('br' is what is left of the HTML <br /> tags embedded in the reviews).
stopwords = set(STOPWORDS) | {'movie', 'movies', 'film', 'films', 'watch', 'br'}


# Take review number 4930 as plain text. The cell is addressed by column name
# plus row position: the chained form data.iloc[4930][0] looks up a
# label-indexed row by integer position, which recent pandas deprecates.
titanic = str(data['review'].iloc[4930])

# Word cloud on a white background that skips the stop words defined above
word_cloud = WordCloud(background_color='white', stopwords=stopwords)

cloud_titanic = word_cloud.generate(titanic)

# Draw the cloud as an image
plt.imshow(cloud_titanic, interpolation='bilinear')

# Hide the axes and display the figure
plt.axis('off')
plt.show()


# Importamos ademas la lista predefinida de palabras vacias
# del ingles de sklearn
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# Extend sklearn's predefined English stop words with a few extra words.
# The result stays a frozenset bound to the same name as before.
stopwords = ENGLISH_STOP_WORDS.union(['film', 'movie', 'cinema', 'theatre'])


# CountVectorizer that ignores those stop words. They are passed as a
# (sorted) list because recent scikit-learn releases validate `stop_words`
# and accept only 'english', a list or None.
vect = CountVectorizer(stop_words=sorted(stopwords))

# Learn the vocabulary from the reviews and build the bag-of-words matrix
# in a single pass (equivalent to fit() followed by transform()).
X = vect.fit_transform(data['review'])


# One column per vocabulary term. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
X_df

C:\ProgramData\Miniconda3\lib\site-packages\sklearn\utils\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)


# Primero tokenizamos las resegnias
from nltk import word_tokenize

# Tokenize every review; each row of the new column holds a list of tokens.
data['tokens'] = data['review'].apply(word_tokenize)
data.head()


# Keep only the purely alphabetic tokens (drops numbers and punctuation).
data['tokens_letras'] = data['tokens'].apply(
    lambda words: [word for word in words if word.isalpha()]
)
data.head()


# The raw review in row 4 ends with numbers ("4 out of 10").
# The cell is addressed by column name plus row position; the chained form
# data.iloc[4][0] looks up a label-indexed row by integer position, which
# recent pandas deprecates.
data['review'].iloc[4]

"I was gifted with this movie as it had such a great premise, the friendship of three women bespoiled by one falling in love with a younger man.<br /><br />Intriguing.<br /><br />NOT! I hasten to add. These women are all drawn in extreme caricature, not very supportive of one another and conspiring and contriving to bring each other down.<br /><br />Anna Chancellor and Imelda Staunton could do no wrong in my book prior to seeing this, but here they are handed a dismal script and told to balance the action between slapstick and screwball, which doesn't work too well when the women are all well known professionals in a very small town.<br /><br />And for intelligent women they spend a whole pile of time bemoaning the lack of men/sex/lust in their lives. I felt much more could have been made of it given a decent script and more tension, the lesbian sub-plot went nowhere and those smoking/drinking women (all 3 in their forties???) were very unrealistic - even in the baby scene - screw the baby, gimme a cigarette! Right.<br /><br />Like I said, a shame of a waste. 4 out of 10."


# Rebuild the same review from the 'tokens_letras' column: the numbers are
# gone because only alphabetic tokens were kept there.
# Each token is followed by one space, so the text ends with a trailing
# space, exactly as the original accumulation loop produced it.
rev = ''.join(f'{token} ' for token in data['tokens_letras'].iloc[4])
rev

'I was gifted with this movie as it had such a great premise the friendship of three women bespoiled by one falling in love with a younger br br br br NOT I hasten to add These women are all drawn in extreme caricature not very supportive of one another and conspiring and contriving to bring each other br br Anna Chancellor and Imelda Staunton could do no wrong in my book prior to seeing this but here they are handed a dismal script and told to balance the action between slapstick and screwball which does work too well when the women are all well known professionals in a very small br br And for intelligent women they spend a whole pile of time bemoaning the lack of in their lives I felt much more could have been made of it given a decent script and more tension the lesbian went nowhere and those women all in their forties were very unrealistic even in the baby scene screw the baby gim me a cigarette br br Like I said a shame of a waste out of '

# General form: CountVectorizer(token_pattern=<regular expression>).
# Example: keep only tokens made of two or more word characters that are
# not digits, so numbers and punctuation never enter the vocabulary.
CountVectorizer(token_pattern=r'\b[^\d\W][^\d\W]+\b')


# Importacion necesaria
from nltk.stem import PorterStemmer

# Instantiate the Porter stemmer
porter = PorterStemmer()


# Stem a sample word (the recorded result is 'hous')
porter.stem('houses')

'hous'


# Utilizaremos la derivacion para palabras en espagnol
from nltk.stem.snowball import SnowballStemmer

# Instantiate the Snowball stemmer, selecting the language by name
esStemmer = SnowballStemmer('spanish')

# For example (the recorded result is 'jug')
esStemmer.stem('jugando')

'jug'


# Stemming also works for German words

# Instantiate with the language name. 'dutch' selects the stemmer for Dutch
# (Nederlands), not for German (Deutsch), so 'german' is what is needed here.
alStemmer = SnowballStemmer('german')

# For example
alStemmer.stem('beginnen')

'beginn'


# Importacion necesaria
from nltk.stem import WordNetLemmatizer

# Instantiate the WordNet lemmatizer
wnlemm =  WordNetLemmatizer()

# For example (the recorded result is 'house', a real word, unlike the stem 'hous')
wnlemm.lemmatize('houses')

'house'


# Lemmatizer instance used for the pass over the whole dataset
wnl = WordNetLemmatizer()

# Map each review's token list to a list of the same length whose tokens
# have been lemmatized.
# NOTE(review): lemmatize() is called without a part-of-speech hint; the
# recorded output shows 'was' -> 'wa' and 'as' -> 'a', presumably because
# tokens are treated as nouns by default. Confirm before relying on this
# column.
data['tokens_lemma'] = data['tokens'].apply(
    lambda words: list(map(wnl.lemmatize, words))
)
data.head()


# Note, in particular, the word "sensibilities" in the raw review.
# The cell is addressed by column name plus row position; the chained form
# data.iloc[1][0] looks up a label-indexed row by integer position, which
# recent pandas deprecates.
data['review'].iloc[1][:250]

"A singularly unfunny musical comedy that artificially tries to marry the then-cutting edge rock 'n' roll explosion with the middle-class sensibilities of a suburban sitcom. The result is a jarringly dated mish-mash that will satisfy none of the audie"


# In the 'tokens_lemma' column (the fifth one, position 4) the word
# "sensibilities" has been replaced by "sensibility".
# Each token is followed by one space, exactly as the original accumulation
# loop produced it.
rev = ''.join(f'{token} ' for token in data['tokens_lemma'].iloc[1])
rev[:250]

"A singularly unfunny musical comedy that artificially try to marry the then-cutting edge rock 'n ' roll explosion with the middle-class sensibility of a suburban sitcom . The result is a jarringly dated mish-mash that will satisfy none of the audienc"


# Importacion de TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Switch datasets: from here on `data` holds the airline tweets.
data = pd.read_csv(filepath_or_buffer='tweets.csv')
data.head()


# The tweet text is the column fed to the vectorizer below.
data['text']

0                      @VirginAmerica What @dhepburn said.
1        @VirginAmerica plus you've added commercials t...
2        @VirginAmerica I didn't today... Must mean I n...
3        @VirginAmerica it's really aggressive to blast...
4        @VirginAmerica and it's a really big bad thing...
                               ...                        
14635    @AmericanAir thank you we got on a different f...
14636    @AmericanAir leaving over 20 minutes Late Flig...
14637    @AmericanAir Please bring American Airlines to...
14638    @AmericanAir you have my money, you change my ...
14639    @AmericanAir we have 8 ppl so we need 2 know h...
Name: text, Length: 14640, dtype: object


# Look at one tweet: the row labelled 4 of the 'text' column.
data.loc[4, 'text']

"@VirginAmerica and it's a really big bad thing about it"


# TF-IDF vectorizer limited to a vocabulary of 100 terms.
vect = TfidfVectorizer(max_features=100)

# Learn vocabulary and idf weights from the tweets, then encode the same
# tweets as a sparse TF-IDF matrix.
vect.fit(data['text'])
X = vect.transform(data['text'])

X

<14640x100 sparse matrix of type '<class 'numpy.float64'>'
	with 119182 stored elements in Compressed Sparse Row format>


import warnings
# NOTE(review): this silences every warning category from this point on.
# It is kept so later cells behave as before, but the cell below no longer
# needs it now that the deprecated accessor is gone. Consider removing it
# or narrowing it to a specific category.
warnings.filterwarnings('ignore')

# One column per vocabulary term. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())
X_df

	review	label	tokens
0	This short spoof can be found on Elite's Mille...	0	[This, short, spoof, can, be, found, on, Elite...
1	A singularly unfunny musical comedy that artif...	0	[A, singularly, unfunny, musical, comedy, that...
2	An excellent series, masterfully acted and dir...	1	[An, excellent, series, ,, masterfully, acted,...
3	The master of movie spectacle Cecil B. De Mill...	1	[The, master, of, movie, spectacle, Cecil, B, ...
4	I was gifted with this movie as it had such a ...	0	[I, was, gifted, with, this, movie, as, it, ha...

	review	label	tokens	tokens_letras
0	This short spoof can be found on Elite's Mille...	0	[This, short, spoof, can, be, found, on, Elite...	[This, short, spoof, can, be, found, on, Elite...
1	A singularly unfunny musical comedy that artif...	0	[A, singularly, unfunny, musical, comedy, that...	[A, singularly, unfunny, musical, comedy, that...
2	An excellent series, masterfully acted and dir...	1	[An, excellent, series, ,, masterfully, acted,...	[An, excellent, series, masterfully, acted, an...
3	The master of movie spectacle Cecil B. De Mill...	1	[The, master, of, movie, spectacle, Cecil, B, ...	[The, master, of, movie, spectacle, Cecil, B, ...
4	I was gifted with this movie as it had such a ...	0	[I, was, gifted, with, this, movie, as, it, ha...	[I, was, gifted, with, this, movie, as, it, ha...

	review	label	tokens	tokens_letras	tokens_lemma
0	This short spoof can be found on Elite's Mille...	0	[This, short, spoof, can, be, found, on, Elite...	[This, short, spoof, can, be, found, on, Elite...	[This, short, spoof, can, be, found, on, Elite...
1	A singularly unfunny musical comedy that artif...	0	[A, singularly, unfunny, musical, comedy, that...	[A, singularly, unfunny, musical, comedy, that...	[A, singularly, unfunny, musical, comedy, that...
2	An excellent series, masterfully acted and dir...	1	[An, excellent, series, ,, masterfully, acted,...	[An, excellent, series, masterfully, acted, an...	[An, excellent, series, ,, masterfully, acted,...
3	The master of movie spectacle Cecil B. De Mill...	1	[The, master, of, movie, spectacle, Cecil, B, ...	[The, master, of, movie, spectacle, Cecil, B, ...	[The, master, of, movie, spectacle, Cecil, B, ...
4	I was gifted with this movie as it had such a ...	0	[I, was, gifted, with, this, movie, as, it, ha...	[I, was, gifted, with, this, movie, as, it, ha...	[I, wa, gifted, with, this, movie, a, it, had,...

	tweet_id	airline_sentiment	airline_sentiment_confidence	negativereason	negativereason_confidence	airline	airline_sentiment_gold	name	negativereason_gold	text	tweet_coord	tweet_created	tweet_location	user_timezone
0	570306133677760513	neutral	1.0000	NaN	NaN	Virgin America	NaN	cairdin	NaN	@VirginAmerica What @dhepburn said.	NaN	2015-02-24 11:35:52 -0800	NaN	Eastern Time (US & Canada)
1	570301130888122368	positive	0.3486	NaN	0.0000	Virgin America	NaN	jnardino	NaN	@VirginAmerica plus you've added commercials t...	NaN	2015-02-24 11:15:59 -0800	NaN	Pacific Time (US & Canada)
2	570301083672813571	neutral	0.6837	NaN	NaN	Virgin America	NaN	yvonnalynn	NaN	@VirginAmerica I didn't today... Must mean I n...	NaN	2015-02-24 11:15:48 -0800	Lets Play	Central Time (US & Canada)
3	570301031407624196	negative	1.0000	Bad Flight	0.7033	Virgin America	NaN	jnardino	NaN	@VirginAmerica it's really aggressive to blast...	NaN	2015-02-24 11:15:36 -0800	NaN	Pacific Time (US & Canada)
4	570300817074462722	negative	1.0000	Can't Tell	1.0000	Virgin America	NaN	jnardino	NaN	@VirginAmerica and it's a really big bad thing...	NaN	2015-02-24 11:14:45 -0800	NaN	Pacific Time (US & Canada)

	about	after	again	airline	all	am	americanair	amp	an	and	...	was	we	what	when	why	will	with	would	you	your
0	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000	0.0	0.000000	...	0.0	0.000000	0.668165	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000
1	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000	0.0	0.000000	...	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.329040	0.000000
2	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000	0.0	0.000000	...	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000
3	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.431149	0.0	0.000000	...	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.332355
4	0.494872	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000	0.0	0.279754	...	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
14635	0.000000	0.0	0.0	0.0	0.0	0.0	0.293653	0.000000	0.0	0.000000	...	0.0	0.402305	0.000000	0.0	0.0	0.0	0.0	0.0	0.268285	0.000000
14636	0.000000	0.0	0.0	0.0	0.0	0.0	0.181266	0.000000	0.0	0.000000	...	0.0	0.248334	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000
14637	0.000000	0.0	0.0	0.0	0.0	0.0	0.487504	0.000000	0.0	0.000000	...	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000
14638	0.000000	0.0	0.0	0.0	0.0	0.0	0.188272	0.000000	0.0	0.179597	...	0.0	0.000000	0.000000	0.0	0.0	0.0	0.0	0.0	0.344014	0.232585
14639	0.000000	0.0	0.0	0.0	0.0	0.0	0.164169	0.000000	0.0	0.000000	...	0.0	0.449822	0.000000	0.0	0.0	0.0	0.0	0.0	0.000000	0.000000

Procesamiento de lenguaje natural

Análisis de sentimientos

Bolsa de Palabras

Operadores de cadena y comparación

Stematización y lematización

Algoritmo Tf-idf

	00	000	000s	007	0080	0083	0093638	00am	00pm	00s	...	zukovic	zulu	zuniga	zvyagvatsev	zwick	zx81	zy	zzzzzzzzzzzz	zzzzzzzzzzzzz	â½
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
7496	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
7497	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
7498	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
7499	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
7500	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0