import pandas as pd
import numpy as np

# Load the house data from the CSV file into a DataFrame
data_df = pd.read_csv('kc_house_data.csv')
# Bare expression: shows the DataFrame when run as a notebook cell
data_df


# Convert the DataFrame into a NumPy array
# (the displayed result has dtype=object because the columns mix numbers and strings)
data_array = np.array(data_df)
data_array

array([[7129300520, '20141013T000000', 221900.0, ..., -122.257, 1340,
        5650],
       [6414100192, '20141209T000000', 538000.0, ..., -122.319, 1690,
        7639],
       [5631500400, '20150225T000000', 180000.0, ..., -122.233, 2720,
        8062],
       ...,
       [1523300141, '20140623T000000', 402101.0, ..., -122.299, 1020,
        2007],
       [291310100, '20150116T000000', 400000.0, ..., -122.069, 1410,
        1287],
       [1523300157, '20141015T000000', 325000.0, ..., -122.299, 1020,
        1357]], dtype=object)


# Using NumPy: cast the price column to float32 and the waterfront flag to bool.
price_np = np.array(data_df.price, dtype=np.float32)
# `np.bool` was only a deprecated alias of the builtin `bool` (NumPy 1.20) and
# is not available in later releases, so the builtin is used directly.
waterfront_np = np.array(data_df.waterfront, dtype=bool)
waterfront_np

C:\Users\usuario\AppData\Local\Temp\ipykernel_11444\1061770669.py:3: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  waterfront_np = np.array(data_df.waterfront, np.bool)

array([False, False, False, ..., False, False, False])


# Using TensorFlow: the same casts, producing tensors instead of NumPy arrays
import tensorflow as tf
price_tf = tf.cast(data_df.price, tf.float32)
waterfront_tf = tf.cast(data_df.waterfront, tf.bool)
price_tf

<tf.Tensor: shape=(21613,), dtype=float32, numpy=
array([221900., 538000., 180000., ..., 402101., 400000., 325000.],
      dtype=float32)>

# Compute the mean squared error (MSE) loss
# NOTE(review): `targets` and `predictions` are not defined in the visible code
# before this line -- confirm they exist before running this cell.
loss = tf.keras.losses.mse(targets, predictions)

# Definition of a linear regression model
# NOTE(review): the defaults `slope` and `features` are evaluated when the
# function is defined, and neither name is defined in the visible code above.
def linear_regression(intercept, slope = slope, features = features):
    """Return the model predictions: intercept + features * slope."""
    scaled_features = features * slope
    return intercept + scaled_features


# Show the column names of the dataset
data_df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


import seaborn as sns
import matplotlib.pyplot as plt
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and later
# removed; prefer the renamed 'seaborn-v0_8' style and fall back to the old
# name only on Matplotlib versions that do not ship the new one.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.scatterplot(x='sqft_lot15', y='price', data=data_df)

# Plot on a logarithmic scale, which is useful when we suspect
# that the relationship is proportional
plt.xscale('log')
plt.yscale('log')
plt.ylabel('log(price)')
plt.xlabel('log(size)')
plt.show()


# Add two new columns holding the logarithms of the size and of the price.
# np.log is applied to the whole column at once (vectorized) instead of calling
# a Python lambda once per row; the resulting values are the same.
data_df['sqft_lot15_log'] = np.log(data_df.sqft_lot15)
data_df['price_log'] = np.log(data_df.price)
# Scatter plot of the log-transformed data with the fitted regression line
# (ci=None disables the confidence band)
sns.regplot(x='sqft_lot15_log',
            y='price_log',
            ci=None,
            marker='.',
            data=data_df)
plt.title('Datos con la recta ajustada', size=16)
plt.show()


# Define the targets and the features
price = np.array(data_df.price, dtype=np.float32)
size = np.array(data_df.sqft_lot15, dtype=np.float32)

# Initialise the intercept and the slope as trainable variables.
# The dtype must be given by keyword: the second positional parameter of
# tf.Variable is `trainable`, so `tf.Variable(0.1, np.float32)` never set it.
intercepto = tf.Variable(0.1, dtype=tf.float32)
pendiente = tf.Variable(0.1, dtype=tf.float32)


# Simple linear regression model
def linear_regression(intercept, slope, features=size):
    """Return the predictions intercept + features * slope.

    `features` defaults to the module-level `size` array.
    """
    scaled_features = features * slope
    return intercept + scaled_features


def loss_function(intercept, slope, targets=price, features=size):
    """Return the MSE between `targets` and the linear model's predictions.

    Parameters:
        intercept: intercept of the linear model.
        slope: slope of the linear model.
        targets: observed values to compare against (defaults to `price`).
        features: inputs fed to the model (defaults to `size`).
    """
    # Forward `features` explicitly: it used to be ignored, so the model was
    # always evaluated on its own default no matter what the caller passed.
    predictions = linear_regression(intercept, slope, features)
    return tf.keras.losses.mse(targets, predictions)


# We could pass a parameter value to the optimizer, which
# would be the learning rate; here Adam is created with its defaults
op = tf.keras.optimizers.Adam()


# Minimise the loss function over 1000 iterations,
# recording the loss reached after every optimisation step.
results = []
variables = [intercepto, pendiente]  # variables the optimizer may update
for _ in range(1000):
    # One optimisation step: the lambda is the function to minimise
    op.minimize(lambda: loss_function(intercepto, pendiente),
                var_list=variables)
    current_loss = loss_function(intercepto, pendiente)
    results.append(current_loss)


# Look at the last loss values obtained (from index 950 onwards)
results[950:]

[<tf.Tensor: shape=(), dtype=float32, numpy=389806300000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389797280000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389788240000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389779230000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389770200000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389761200000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389752200000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389743180000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389734170000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389725160000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389716180000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389707170000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389698160000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389689180000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389680140000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389671200000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389662200000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389653200000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389644200000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389635200000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389626230000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389617250000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389608280000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389599260000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389590300000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389581340000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389572360000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389563400000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389554370000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389545430000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389536450000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389527470000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389518520000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389509550000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389500600000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389491620000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389482680000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389473700000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389464750000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389455800000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389446860000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389437900000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389428900000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389419960000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389411050000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389402100000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389393150000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389384240000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389375260000.0>,
 <tf.Tensor: shape=(), dtype=float32, numpy=389366380000.0>]


# Print the trained parameters (intercept first, then slope)
print(intercepto.numpy())
print(pendiente.numpy())

3.0649319
2.8952818


# Define the targets and the features (log-transformed price and lot size,
# plus the number of bedrooms)
price = np.array(data_df.price_log, dtype=np.float32)
size = np.array(data_df.sqft_lot15_log, dtype=np.float32)
bedrooms = np.array(data_df.bedrooms, dtype=np.float32)

# Initialise the parameters as trainable variables.
# The dtype must be given by keyword: the second positional parameter of
# tf.Variable is `trainable`, so `tf.Variable(0.1, np.float32)` never set it.
beta0 = tf.Variable(0.1, dtype=tf.float32)
beta1 = tf.Variable(0.1, dtype=tf.float32)
beta2 = tf.Variable(0.1, dtype=tf.float32)

# Multiple linear regression model with two features
def linear_regression(params, feature1=size, feature2 = bedrooms):
    """Return params[0] + feature1 * params[1] + feature2 * params[2]."""
    first_term = feature1 * params[1]
    second_term = feature2 * params[2]
    return params[0] + first_term + second_term

def loss_function(params, targets=price, feature1=size, feature2 = bedrooms):
    """Return the MSE between `targets` and the two-feature model's predictions."""
    return tf.keras.losses.mse(
        targets,
        linear_regression(params, feature1, feature2),
    )

# Adam optimizer, created with its default settings
op = tf.keras.optimizers.Adam()

# Minimise the loss function over 1000 iterations,
# recording the loss reached after every optimisation step.
results = []
betas = (beta0, beta1, beta2)
for _ in range(1000):
    op.minimize(lambda: loss_function(params=betas),
                var_list=[beta0, beta1, beta2])
    current_loss = loss_function(params=betas)
    results.append(current_loss)


# Look at the last loss values obtained (from index 950 onwards)
results[950:]

[<tf.Tensor: shape=(), dtype=float32, numpy=7.9905834>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.9624834>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.934484>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.90659>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.878797>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.8511066>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.823517>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.796027>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.768639>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.7413507>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.714161>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.687072>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.6600847>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.633197>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.6064067>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.579716>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.5531235>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.526629>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.500232>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.4739323>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.4477296>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.4216247>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.395617>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.369704>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.3438873>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.3181677>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.292544>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.2670174>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.2415857>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.2162495>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.191006>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.165859>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.1408057>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.1158476>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.0909796>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.0662065>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.041527>,
 <tf.Tensor: shape=(), dtype=float32, numpy=7.016939>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.9924407>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.9680367>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.943723>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.919502>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.895371>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.871333>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.847385>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.823528>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.799761>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.776084>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.752497>,
 <tf.Tensor: shape=(), dtype=float32, numpy=6.729>]


# Print the trained parameters
print(beta0.numpy())
print(beta1.numpy())
print(beta2.numpy())

0.8046444
0.8013589
0.79487073


# Load the data in batches (chunks) of size 10000;
# the result is an iterator
df = pd.read_csv('kc_house_data.csv', chunksize=10000)
# Look at the shape of each batch
for lote in df:
    print(lote.shape)

(10000, 21)
(10000, 21)
(1613, 21)


# Shape of the complete DataFrame
data_df.shape

(21613, 23)


# Load the data in batches:
# two auxiliary lists collect the per-batch pieces of each column
price_list, size_list = [], []
# Iterate over the data, read in batches of 100 rows
for lote in pd.read_csv('kc_house_data.csv', chunksize=100):
    # Keep the price column of this batch ...
    price_list.append(lote['price'])
    # ... and likewise its size column
    size_list.append(lote['sqft_lot15'])
# Join all the batches back into a single Series per column
price, size = pd.concat(price_list), pd.concat(size_list)


# Price column as loaded in batches
price

0        221900.0
1        538000.0
2        180000.0
3        604000.0
4        510000.0
           ...   
21608    360000.0
21609    400000.0
21610    402101.0
21611    400000.0
21612    325000.0
Name: price, Length: 21613, dtype: float64


# Compare the data from the batch load with the data from
# the normal load (element-wise comparison, one boolean per row)
data_df.price == price

0        True
1        True
2        True
3        True
4        True
         ... 
21608    True
21609    True
21610    True
21611    True
21612    True
Name: price, Length: 21613, dtype: bool


# Convert the batch-loaded columns to float32 NumPy arrays
price_np = np.array(price, dtype=np.float32)
size_np = np.array(size, dtype=np.float32)


# Define the variables used by the simple linear regression model.
# The dtype must be given by keyword: the second positional parameter of
# tf.Variable is `trainable`, so `tf.Variable(0.1, tf.float32)` never set it.
intercepto = tf.Variable(0.1, dtype=tf.float32)
pendiente = tf.Variable(0.1, dtype=tf.float32)

# Simple linear regression model (single feature, passed explicitly)
def linear_regression(intercept, slope, features):
    """Return the predictions intercept + features * slope."""
    scaled_features = features * slope
    return intercept + scaled_features
# Loss function: MSE of the linear model on the given batch
def loss_function(intercept, slope, targets, features):
    """Return the MSE between `targets` and the model's predictions for `features`."""
    return tf.keras.losses.mse(
        targets,
        linear_regression(intercept, slope, features),
    )

# Adam optimizer, created with its default settings
op = tf.keras.optimizers.Adam()


# Read the data in batches of size 100
df = pd.read_csv('kc_house_data.csv', chunksize=100)
variables = [intercepto, pendiente]  # variables the optimizer may update
# One optimisation step per batch
for lote in df:
    # Extract the target and feature columns of this batch
    price_lote = np.array(lote['price'], dtype=np.float32)
    size_lote = np.array(lote['sqft_lot15'], dtype=np.float32)

    # Loss evaluated on the current batch only
    def lote_loss():
        return loss_function(intercepto, pendiente, price_lote, size_lote)

    # Minimise the loss function
    op.minimize(lote_loss, var_list=variables)


# Print the intercept and slope obtained after the batch training
print(intercepto.numpy(), pendiente.numpy())

0.31784675 0.30197635

Redes neuronales

Clase 7: Modelos lineales

Contenido

Función de pérdida

Regresión lineal

Entrenamiento por lotes

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	...	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	0	0	...	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	0	0	...	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	0	0	...	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	0	0	...	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	0	0	...	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
21608	263000018	20140521T000000	360000.0	3	2.50	1530	1131	3.0	0	0	...	8	1530	0	2009	0	98103	47.6993	-122.346	1530	1509
21609	6600060120	20150223T000000	400000.0	4	2.50	2310	5813	2.0	0	0	...	8	2310	0	2014	0	98146	47.5107	-122.362	1830	7200
21610	1523300141	20140623T000000	402101.0	2	0.75	1020	1350	2.0	0	0	...	7	1020	0	2009	0	98144	47.5944	-122.299	1020	2007
21611	291310100	20150116T000000	400000.0	3	2.50	1600	2388	2.0	0	0	...	8	1600	0	2004	0	98027	47.5345	-122.069	1410	1287
21612	1523300157	20141015T000000	325000.0	2	0.75	1020	1076	2.0	0	0	...	7	1020	0	2008	0	98144	47.5941	-122.299	1020	1357