import pandas as pd
# Load the sample CSV and keep only the 'review' and 'label' columns.
data = pd.read_csv('IMDB_sample.csv')[['review', 'label']]
# Preview the first rows of the frame.
data.head()


# Count how many rows carry each value of the 'label' column.
data.value_counts('label')

label
0    3782
1    3719
dtype: int64


# Express each label's row count as a percentage of the total number of rows.
(data.value_counts('label') / len(data)) * 100

label
0    50.419944
1    49.580056
dtype: float64


# Add a 'long' column holding the length of each review string;
# the built-in len is applied element-wise to the 'review' column.
data['long'] = data['review'].apply(len)
data['long']

0        667
1       2982
2        669
3        691
4       1087
        ... 
7496     633
7497     650
7498     654
7499     918
7500     336
Name: long, Length: 7501, dtype: int64


# Length of the longest review
data['long'].max()

10321


# Display the row(s) whose length equals that maximum,
# i.e. the longest review itself
data.loc[data['long'] == data['long'].max()]


# Alternative to the previous approach: the .str accessor exposes
# a len() method that computes the length of every review
data['review'].str.len()

0        667
1       2982
2        669
3        691
4       1087
        ... 
7496     633
7497     650
7498     654
7499     918
7500     336
Name: review, Length: 7501, dtype: int64


# Length of the shortest review
data['review'].str.len().min()

52


# Display the row(s) whose length equals the minimum,
# i.e. the shortest review itself
data.loc[data['long'] == data['review'].str.len().min()]


# Required import
from textblob import TextBlob

# Sample sentence to score
text = "Today was a good day."

# Wrap the sentence in a TextBlob object
valence = TextBlob(text)

# Read the valence score through the sentiment attribute
# (a polarity / subjectivity pair)
valence.sentiment

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)


# Display once more the row(s) holding the longest review
data.loc[data['long'] == data['long'].max()]


# Extract the longest review as a plain string. The row is located through
# the computed lengths instead of a hard-coded position, and the column is
# selected by its label: integer keys on a label-indexed row Series rely on
# a positional fallback that pandas has deprecated.
titanic = str(data.loc[data['long'].idxmax(), 'review'])
# Show the first 1000 characters
titanic[:1000]

'Titanic directed by James Cameron presents a fictional love story on the historical setting of the Titanic. The plot is simple, noncomplicated, or not for those who love plots that twist and turn and keep you in suspense. The end of the movie can be figured out within minutes of the start of the film, but the love story is an interesting one, however. Kate Winslett is wonderful as Rose, an aristocratic young lady betrothed by Cal (Billy Zane). Early on the voyage Rose meets Jack (Leonardo DiCaprio), a lower class artist on his way to America after winning his ticket aboard Titanic in a poker game. If he wants something, he goes and gets it unlike the upper class who are so concerned with their social worries. The two fall in love and the audience sees the sinking of the Titanic primarily through their eyes.<br /><br />The movie begins in modern times with the exploration of the wreck by a group searching for treasures, that sunk with the Titanic, which has recently occurred. One of the'


import re

# Search pattern: the literal double line-break tag '<br /><br />'
pattern = re.compile(r'<br /><br />')
# search() reports only the first occurrence found in the review
match = pattern.search(titanic)

# Show the match that was found
print(match)

<re.Match object; span=(818, 830), match='<br /><br />'>


# Replace every '<br /><br />' occurrence with a single space
titanic_clean = pattern.sub(' ', titanic)
# Show the first 1000 characters of the cleaned review
titanic_clean[:1000]

'Titanic directed by James Cameron presents a fictional love story on the historical setting of the Titanic. The plot is simple, noncomplicated, or not for those who love plots that twist and turn and keep you in suspense. The end of the movie can be figured out within minutes of the start of the film, but the love story is an interesting one, however. Kate Winslett is wonderful as Rose, an aristocratic young lady betrothed by Cal (Billy Zane). Early on the voyage Rose meets Jack (Leonardo DiCaprio), a lower class artist on his way to America after winning his ticket aboard Titanic in a poker game. If he wants something, he goes and gets it unlike the upper class who are so concerned with their social worries. The two fall in love and the audience sees the sinking of the Titanic primarily through their eyes. The movie begins in modern times with the exploration of the wreck by a group searching for treasures, that sunk with the Titanic, which has recently occurred. One of the survivors '


# Wrap the cleaned review in a TextBlob object
blob_titanic = TextBlob(titanic_clean)

# Read the valence score through the sentiment attribute
# (a polarity / subjectivity pair)
blob_titanic.sentiment

Sentiment(polarity=0.20791734011246213, subjectivity=0.453315898193947)


# Polarity: first field of the Sentiment result, read by name
blob_titanic.sentiment.polarity

0.20791734011246213


# Subjectivity: second field of the Sentiment result, read by name
blob_titanic.sentiment.subjectivity

0.453315898193947


# Required imports
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Create a WordCloud object with its default settings
word_cloud = WordCloud()

# Build the word cloud with the generate() method,
# applied to the cleaned text of the longest review
cloud_titanic = word_cloud.generate(titanic_clean)


# Draw the word cloud as an image, using bilinear interpolation
plt.imshow(cloud_titanic, interpolation='bilinear')
# Hide the axes
plt.axis('off')
plt.show()


# Create a WordCloud object, this time changing the background color
word_cloud = WordCloud(background_color='white')

cloud_titanic = word_cloud.generate(titanic_clean)

# Plot the word cloud
plt.imshow(cloud_titanic, interpolation='bilinear')
# Hide the axes
plt.axis('off')
plt.show()


# Build a set with the English stop words from the NLTK corpus
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Create a WordCloud object with a white background, telling it to
# leave the stop words out when the word cloud is built
word_cloud = WordCloud(background_color='white', stopwords=stop_words)

cloud_titanic = word_cloud.generate(titanic_clean)

# Plot the word cloud
plt.imshow(cloud_titanic, interpolation='bilinear')
# Hide the axes
plt.axis('off')
plt.show()

	review	label
0	This short spoof can be found on Elite's Mille...	0
1	A singularly unfunny musical comedy that artif...	0
2	An excellent series, masterfully acted and dir...	1
3	The master of movie spectacle Cecil B. De Mill...	1
4	I was gifted with this movie as it had such a ...	0

pros	contras
Pueden revelar lo esencial	A veces tienden a no funcionar bien
Tratarán todas las palabras en un texto	Todas las palabras trazadas en la nube de palabras pueden parecer no relacionadas
Un escaneo rápido de la imagen puede proporcionar un sentido general del texto	Podría ser difícil sacar una conclusión basada en una nube de palabras abarrotada
Son fáciles de entender y visualmente estéticos	Si el texto es largo, la nube de palabras puede requerir bastante preprocesamiento previo

Procesamiento de lenguaje natural¶

Análisis de sentimientos¶

Introducción ¶

Exploración de los datos ¶

Algoritmos para el análisis de sentimientos ¶

Métricas ¶

Nube de palabras ¶