import pandas as pd

# Load the processed-text CSV published alongside the notebook.
df_lematizador = pd.read_csv('https://luisapaez.github.io/Teoria_Galois/Texto_procesado.csv')

# Keep only the 'Página' and 'Párrafos' columns. An explicit copy makes `df`
# independent of `df_lematizador`, so adding columns to `df` later does not
# write to a slice of the original frame.
df = df_lematizador[['Página', 'Párrafos']].copy()
df


import re
import nltk
from collections import Counter
import contractions
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch every NLTK resource this file relies on, not just WordNet:
#   - 'wordnet'   -> WordNetLemmatizer
#   - 'stopwords' -> stopwords.words('english')
#   - 'punkt'     -> word_tokenize
#   - 'punkt_tab' -> word_tokenize on recent NLTK releases
#     (NOTE(review): confirm which of the two the installed version loads)
# nltk.download skips packages that are already up to date.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words, kept in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def clean_string(string):
    """Normalize English text into a space-separated string of lemmas.

    The text goes through these steps, in order: line breaks become spaces,
    contractions are expanded, every run of non-alphanumeric characters
    (underscores included) collapses into one space, isolated ``s`` tokens
    are dropped, the text is lowercased, stop words are removed, the
    remaining words are lemmatized, and the result is tokenized and
    filtered against the stop words once more.

    Args:
        string: Raw text to clean.

    Returns:
        The cleaned text as a single string with words separated by one
        space. Uses the module-level ``stop_words`` set.
    """
    # Line breaks are turned into spaces before contractions are expanded
    expanded = contractions.fix(string.replace('\n', ' '))
    # Punctuation becomes whitespace; the ' s ' replacement removes the lone
    # "s" that a possessive leaves behind ("Zipf's" -> "Zipf s")
    normalized = re.sub(r'[\W_]+', ' ', expanded).replace(' s ', ' ').lower()

    lemmatizer = WordNetLemmatizer()
    # Filter stop words on the raw words first: the default (noun)
    # lemmatization alters some of them (e.g. "was" -> "wa"), and the altered
    # form would no longer match the stop-word list afterwards.
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in normalized.split()
        if word not in stop_words
    ]

    # Tokenize the lemmatized text and filter again, because a lemma can
    # itself be a stop word or be split further by the tokenizer.
    kept = [
        token
        for token in word_tokenize(' '.join(lemmas))
        if token not in stop_words
    ]
    return ' '.join(kept)


clean_string("Hello, I'm Saul Goodman. I'll eat cookies")

'hello saul goodman eat cooky'


cad = """Zipf's law is an empirical law, formulated using mathematical statistics,
named after the linguist George Kingsley Zipf, who first proposed it."""
clean_string(cad)

'zipf law empirical law formulated using mathematical statistic named linguist george kingsley zipf first proposed'


# Add a column holding each paragraph split into whitespace-separated words.
# NOTE: this raises AttributeError when a 'Párrafos' entry is a float rather
# than a string (presumably NaN from an empty cell — verify), because floats
# have no .split() method.
df['Senteces_in_tokens'] = df['Párrafos'].apply(lambda x: x.split())

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Input In [31], in <cell line: 3>()
      1 # Creamos una nueva columna en la cual dividimos cada parrafo por
      2 # palabras
----> 3 df['Senteces_in_tokens'] = df['Párrafos'].apply(lambda x: x.split())

File C:\ProgramData\Miniconda3\lib\site-packages\pandas\core\series.py:4433, in Series.apply(self, func, convert_dtype, args, **kwargs)
   4323 def apply(
   4324     self,
   4325     func: AggFuncType,
   (...)
   4328     **kwargs,
   4329 ) -> DataFrame | Series:
   4330     """
   4331     Invoke function on values of Series.
   4332 
   (...)
   4431     dtype: float64
   4432     """
-> 4433     return SeriesApply(self, func, convert_dtype, args, kwargs).apply()

File C:\ProgramData\Miniconda3\lib\site-packages\pandas\core\apply.py:1082, in SeriesApply.apply(self)
   1078 if isinstance(self.f, str):
   1079     # if we are a string, try to dispatch
   1080     return self.apply_str()
-> 1082 return self.apply_standard()

File C:\ProgramData\Miniconda3\lib\site-packages\pandas\core\apply.py:1137, in SeriesApply.apply_standard(self)
   1131         values = obj.astype(object)._values
   1132         # error: Argument 2 to "map_infer" has incompatible type
   1133         # "Union[Callable[..., Any], str, List[Union[Callable[..., Any], str]],
   1134         # Dict[Hashable, Union[Union[Callable[..., Any], str],
   1135         # List[Union[Callable[..., Any], str]]]]]"; expected
   1136         # "Callable[[Any], Any]"
-> 1137         mapped = lib.map_infer(
   1138             values,
   1139             f,  # type: ignore[arg-type]
   1140             convert=self.convert_dtype,
   1141         )
   1143 if len(mapped) and isinstance(mapped[0], ABCSeries):
   1144     # GH#43986 Need to do list(mapped) in order to get treated as nested
   1145     #  See also GH#25959 regarding EA support
   1146     return obj._constructor_expanddim(list(mapped), index=obj.index)

File C:\ProgramData\Miniconda3\lib\site-packages\pandas\_libs\lib.pyx:2870, in pandas._libs.lib.map_infer()

Input In [31], in <lambda>(x)
      1 # Creamos una nueva columna en la cual dividimos cada parrafo por
      2 # palabras
----> 3 df['Senteces_in_tokens'] = df['Párrafos'].apply(lambda x: x.split())

AttributeError: 'float' object has no attribute 'split'


# Keep only the rows whose paragraph is a string; other values (e.g. floats)
# cannot be split into words.
is_text = df['Párrafos'].apply(lambda x: isinstance(x, str))
# Copy the filtered rows so the column assignments below write to an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
df = df[is_text].copy()
# Add a column holding each paragraph split into whitespace-separated words
df['Senteces_in_tokens'] = df['Párrafos'].apply(lambda x: x.split())
# Show the result
df

C:\Users\usuario\AppData\Local\Temp\ipykernel_9852\3556317895.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Senteces_in_tokens'] = df['Párrafos'].apply(lambda x: x.split())


# Number of tokens in each paragraph
df['Longitud'] = df['Senteces_in_tokens'].apply(len)
df

C:\Users\usuario\AppData\Local\Temp\ipykernel_9852\3506120534.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Longitud'] = df['Senteces_in_tokens'].apply(lambda x: len(x))


import matplotlib.pyplot as plt
# Set the plotting style. Matplotlib 3.6 renamed the bundled seaborn styles to
# "seaborn-v0_8*", so use the new name when this installation provides it and
# fall back to the original name otherwise.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# Histogram of the paragraph lengths
df['Longitud'].hist()
plt.show()


df['Longitud'].describe()

count    7542.000000
mean       14.270883
std        12.512122
min         1.000000
25%         5.000000
50%         9.000000
75%        21.000000
max       128.000000
Name: Longitud, dtype: float64


import numpy as np

def normal(x, media, desviacion, top):
    """Evaluate a Gaussian-shaped bell curve scaled to a chosen peak height.

    Args:
        x: Point at which to evaluate the curve (a number, or anything
            ``np.exp`` accepts after the arithmetic below).
        media: Centre of the bell, where the curve reaches its maximum.
        desviacion: Spread of the bell.
        top: Height of the curve at ``x == media``.

    Returns:
        ``top * exp(-0.5 * ((x - media) / desviacion) ** 2)``.
    """
    # Distance from the centre, measured in units of the spread
    z = (x - media) / desviacion
    return top * np.exp(-0.5 * z * z)


normal(15, 15, 5, 160)

160.0


# Build the classes: group the rows by their 'Longitud' value
df_class = df.groupby('Longitud')
df_class

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002B3CD661790>


print(normal(1, 15, 5, 160))
print(round(normal(1, 15, 5, 160)))

3.1745751590992475
3


print(normal(2, 15, 5, 160))
print(round(normal(2, 15, 5, 160)))

5.447592757535893
5


# Resample the corpus so that each length class keeps at most
# normal(length, 15, 6, 160) rows. The per-class pieces are collected in a
# list and concatenated once, which avoids re-copying the growing frame on
# every iteration and avoids concatenating onto an empty DataFrame.
sampled_groups = []

# Iterate over the (length, rows-with-that-length) groups
for classes, dataframe in df_class:
    # Number of documents to take from this class
    sample_size = int(round(normal(classes, 15, 6, 160)))
    if dataframe.shape[0] > sample_size:
        # More rows than needed: draw a random subset of the target size
        sampled_groups.append(dataframe.sample(sample_size))
    else:
        # Not enough rows to subsample: keep the whole class
        sampled_groups.append(dataframe)

# pd.concat raises on an empty list, so fall back to an empty frame
new_data_frame = pd.concat(sampled_groups, axis=0) if sampled_groups else pd.DataFrame()

# Plot the length distribution of the resampled corpus
new_data_frame['Longitud'].hist()
plt.show()


# Same resampling as before but with a narrower bell: each length class keeps
# at most normal(length, 15, 5, 160) rows. The per-class pieces are collected
# in a list and concatenated once, which avoids re-copying the growing frame
# on every iteration and avoids concatenating onto an empty DataFrame.
sampled_groups = []

# Iterate over the (length, rows-with-that-length) groups
for classes, dataframe in df_class:
    # Number of documents to take from this class
    sample_size = int(round(normal(classes, 15, 5, 160)))
    if dataframe.shape[0] > sample_size:
        # More rows than needed: draw a random subset of the target size
        sampled_groups.append(dataframe.sample(sample_size))
    else:
        # Not enough rows to subsample: keep the whole class
        sampled_groups.append(dataframe)

# pd.concat raises on an empty list, so fall back to an empty frame
new_data_frame = pd.concat(sampled_groups, axis=0) if sampled_groups else pd.DataFrame()

# Plot the length distribution of the resampled corpus
new_data_frame['Longitud'].hist()
plt.show()


df[df['Longitud'] == 15].shape[0]

179


new_data_frame


# Realizamos la importacion necesaria
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# The corpus is the list of all paragraphs in the 'Párrafos' column
corpus = df['Párrafos'].to_list()


# Create the vectorizer with its default settings
vectorizer = CountVectorizer()

# Fit the vectorizer to the corpus defined above
vectorizer.fit(corpus)

CountVectorizer()

CountVectorizer()


# Store, for every paragraph, its count vector as a dense 1-D array
df['CountVectorizer'] = df['Párrafos'].apply(
    lambda paragraph: vectorizer.transform([paragraph]).toarray()[0]
)
df

C:\Users\usuario\AppData\Local\Temp\ipykernel_9852\1535703749.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CountVectorizer']=df['Párrafos'].apply(lambda x: np.array(vectorizer.transform([x]).todense())[0])


# Inspect the dimension of one of the resulting vectors (row at position 4)
df['CountVectorizer'].iloc[4].shape

(11960,)


# Count vector of the row at position 4, converted to a plain list
vector_sentence = list(df['CountVectorizer'].iloc[4])
# The paragraph stored in that same row
print(df['Párrafos'].iloc[4])
print('--------Vectorizer--------')
# (index, count) pairs for the non-zero entries of that vector
print([(j, count) for j, count in enumerate(vector_sentence) if count != 0])

warm night castle caladan ancient pile stone served atreides family home twenty six generations bore cooled sweat feeling acquired change weather
--------Vectorizer--------
[(168, 1), (413, 1), (648, 1), (1146, 1), (1385, 1), (1504, 1), (1608, 1), (2216, 1), (3821, 1), (3898, 1), (4432, 1), (5011, 1), (6844, 1), (7490, 1), (9186, 1), (9498, 1), (10072, 1), (10425, 1), (11074, 1), (11553, 1), (11639, 1)]


# Importacion necesaria
from sklearn.metrics.pairwise import cosine_similarity

# Print the paragraphs at positions 0 and 1, separated by a blank line
print(df['Párrafos'].iloc[0])
print()
print(df['Párrafos'].iloc[1])

# Count vectors of those two paragraphs, each wrapped in a list so it forms
# a one-row 2-D input
x = [df['CountVectorizer'].iloc[0]]
y = [df['CountVectorizer'].iloc[1]]

dune

beginning time taking delicate care balances correct every sister bene gesserit knows begin study life muad dib take care first place time born 57th year padishah emperor shaddam iv take special care locate muad dib place planet arrakis deceived fact born caladan lived first fifteen years arrakis planet known dune forever place


# Compute the cosine similarity between the two vectors above
cosine_similarity(x, y)

array([[0.1118034]])

	Página	Párrafos
0	1	dune
1	1	beginning time taking delicate care balances c...
2	1	manual muad dib princess irulan
3	1	week departure arrakis final scurrying reached...
4	1	warm night castle caladan ancient pile stone s...
...	...	...
7602	591	part publication may reproduced stored retriev...
7603	591	means without prior permission writing publish...
7604	591	circulated form binding cover published withou...
7605	591	condition including condition imposed subseque...
7606	591	www orionbooks co uk

	Página	Párrafos	Senteces_in_tokens
0	1	dune	[dune]
1	1	beginning time taking delicate care balances c...	[beginning, time, taking, delicate, care, bala...
2	1	manual muad dib princess irulan	[manual, muad, dib, princess, irulan]
3	1	week departure arrakis final scurrying reached...	[week, departure, arrakis, final, scurrying, r...
4	1	warm night castle caladan ancient pile stone s...	[warm, night, castle, caladan, ancient, pile, ...
...	...	...	...
7602	591	part publication may reproduced stored retriev...	[part, publication, may, reproduced, stored, r...
7603	591	means without prior permission writing publish...	[means, without, prior, permission, writing, p...
7604	591	circulated form binding cover published withou...	[circulated, form, binding, cover, published, ...
7605	591	condition including condition imposed subseque...	[condition, including, condition, imposed, sub...
7606	591	www orionbooks co uk	[www, orionbooks, co, uk]

	Página	Párrafos	Senteces_in_tokens	Longitud
0	1	dune	[dune]	1
1	1	beginning time taking delicate care balances c...	[beginning, time, taking, delicate, care, bala...	52
2	1	manual muad dib princess irulan	[manual, muad, dib, princess, irulan]	5
3	1	week departure arrakis final scurrying reached...	[week, departure, arrakis, final, scurrying, r...	16
4	1	warm night castle caladan ancient pile stone s...	[warm, night, castle, caladan, ancient, pile, ...	21
...	...	...	...	...
7602	591	part publication may reproduced stored retriev...	[part, publication, may, reproduced, stored, r...	9
7603	591	means without prior permission writing publish...	[means, without, prior, permission, writing, p...	7
7604	591	circulated form binding cover published withou...	[circulated, form, binding, cover, published, ...	7
7605	591	condition including condition imposed subseque...	[condition, including, condition, imposed, sub...	6
7606	591	www orionbooks co uk	[www, orionbooks, co, uk]	4

	Página	Párrafos	Senteces_in_tokens	Longitud	CountVectorizer
0	1	dune	[dune]	1	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1	1	beginning time taking delicate care balances c...	[beginning, time, taking, delicate, care, bala...	52	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2	1	manual muad dib princess irulan	[manual, muad, dib, princess, irulan]	5	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3	1	week departure arrakis final scurrying reached...	[week, departure, arrakis, final, scurrying, r...	16	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4	1	warm night castle caladan ancient pile stone s...	[warm, night, castle, caladan, ancient, pile, ...	21	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
...	...	...	...	...	...
7602	591	part publication may reproduced stored retriev...	[part, publication, may, reproduced, stored, r...	9	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7603	591	means without prior permission writing publish...	[means, without, prior, permission, writing, p...	7	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7604	591	circulated form binding cover published withou...	[circulated, form, binding, cover, published, ...	7	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7605	591	condition including condition imposed subseque...	[condition, including, condition, imposed, sub...	6	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7606	591	www orionbooks co uk	[www, orionbooks, co, uk]	4	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

Procesamiento de lenguaje natural

Más sobre preprocesamiento

Algunas técnicas de vectorización

	Página	Párrafos	Senteces_in_tokens	Longitud
3	1	week departure arrakis final scurrying reached...	[week, departure, arrakis, final, scurrying, r...	16
27	2	popular man arouses jealousy powerful hawat sa...	[popular, man, arouses, jealousy, powerful, ha...	35
22	2	reverence way called mother jessica like commo...	[reverence, way, called, mother, jessica, like...	18
16	2	within shadows bed paul held eyes open mere sl...	[within, shadows, bed, paul, held, eyes, open,...	20
18	2	need faculties meet gom jabbar	[need, faculties, meet, gom, jabbar]	5
...	...	...	...	...
7575	590	basis latitude meridian observatory mountain b...	[basis, latitude, meridian, observatory, mount...	15
7578	590	harg pass shrine leto skull overlooks pass old...	[harg, pass, shrine, leto, skull, overlooks, p...	15
7584	590	wind pass cliff walled opens sink villages wor...	[wind, pass, cliff, walled, opens, sink, villa...	13
7582	590	red chasm 1582 bled level rimwall west high sc...	[red, chasm, 1582, bled, level, rimwall, west,...	13
7597	591	right frank herbert identified author work ass...	[right, frank, herbert, identified, author, wo...	7