import xarray as xa
import pandas as pd
import sklearn.decomposition as skdec
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from pyhdf.SD import SD, SDC, SDim
import os
import numpy as np
import pyproj
import geopandas as gp
import sklearn.linear_model as sklm
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xb
munis = gp.read_file("../../DATOS/GEO GT/gt-munis-lowq-2017_splits_from_2011.geojson")
deptos = gp.read_file("../../DATOS/GEO GT/gt-deptos-lowq-2017.geojson")
deptos = deptos[deptos.CODIGO!="2300"]
deptos.plot()
def readHDF(filename, verbose=False):
    """Read the five MOD13 vegetation layers from one HDF file."""
    _file = SD(filename, SDC.READ)
    _sds_objs = [
        _file.select('500m 16 days EVI'),
        _file.select('500m 16 days red reflectance'),
        _file.select('500m 16 days NIR reflectance'),
        _file.select('500m 16 days blue reflectance'),
        _file.select('500m 16 days MIR reflectance')
    ]
    _datas = [x.get() for x in _sds_objs]
    if verbose:
        print("file: ", filename)
        print("file.info() = ", _file.info())
        print("data.shape = ", _datas[0].shape)
    _file.end()
    return np.array(_datas)
modVegs = []
for f in os.listdir("../../DATOS/MODIS/veg16d"):
    if f.endswith(".hdf") and "2003" in f:
        modVegs.append(readHDF("../../DATOS/MODIS/veg16d/" + f, True))
modVegs = np.array(modVegs)
modVegs.shape
The first dimension is time (16-day periods). The second dimension indexes the data layers (EVI, red, NIR, blue, MIR), and the remaining two are longitude and latitude.
# Datasets from different dates should be different:
np.sum(modVegs[20,:,:,:] - modVegs[5,:,:,:])
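A minimal sanity check of that layout (a sketch; it assumes the five layers were read in the order listed in readHDF and that the tile is a standard 2400×2400 MOD13 grid):
n_dates, n_layers, n_rows, n_cols = modVegs.shape
# five layers per date; MOD13A1 tiles are 2400x2400 pixels at 500 m
assert n_layers == 5 and n_rows == 2400 and n_cols == 2400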
sinmodis = pyproj.Proj("+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs")
sinupper = (-10007554.677000,2223901.039333)
sinlower = (-8895604.157333,1111950.519667)
upper = sinmodis(sinupper[0], sinupper[1], inverse=True)
lower = sinmodis(sinlower[0], sinlower[1], inverse=True)
print("Upper bound: ", upper)
print("Lower bound: ", lower)
sina = np.linspace(sinupper[0], sinlower[0], 2400)
sinb = np.linspace(sinupper[1], sinlower[1], 2400)
sinspa, sinspb = np.meshgrid(sina, sinb)
lons, lats = sinmodis(sinspa.flatten(), sinspb.flatten(), inverse=True)
lons = lons.reshape((2400,2400))
lats = lats.reshape((2400,2400))
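A quick round-trip check of the projection (a sketch: projecting the recovered lon/lat forward should reproduce the sinusoidal tile corner to within a meter or so):
x, y = sinmodis(upper[0], upper[1])  # forward: (lon, lat) -> sinusoidal metres
assert abs(x - sinupper[0]) < 1 and abs(y - sinupper[1]) < 1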
# Guatemala lies within this window
plt.imshow(modVegs[0,0,280:1500,40:1400])
modVegs_B = modVegs[:,:,280:1500,40:1400]
modVegs_C = modVegs_B.reshape((-1,modVegs_B.shape[-2], modVegs_B.shape[-1]))
modVegs_C.shape
# Flatten: 23 dates x 5 layers = 115 rows, one column per pixel
modVegs_D = modVegs_C.reshape((115, -1))
latsD = lats[280:1500,40:1400].reshape((-1,))
lonsD = lons[280:1500,40:1400].reshape((-1,))
latsD.shape, modVegs_D.shape
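Each column of modVegs_D is now a single pixel, and latsD/lonsD hold that pixel's coordinates. A quick consistency check (a sketch using the names defined above):
# one latitude/longitude per flattened pixel column
assert modVegs_D.shape[1] == latsD.shape[0] == lonsD.shape[0]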
import rasterio as rst
cobveg03 = rst.open("../../DATOS/SEGEPLAN Otros/cobertura_vegetal_uso_tierra_2003_layers.tif")
descs = cobveg03.descriptions
descs = pd.Series(descs)
cobveg03.crs
print("Primeras veinte capas: ", ", ".join(descs[0:20]) + "...")
cobveglayers = []
for i in range(1, len(descs)+1):
    cobveglayers.append(cobveg03.read(i))
cobveglayers = np.array(cobveglayers)
ax = plt.subplot(111)
# north-up raster: row 0 is the northern edge, so the extent goes (left, right, bottom, top)
ax.imshow(cobveglayers[24,:,:], cmap="Blues", extent=(cobveg03.bounds.left, cobveg03.bounds.right, cobveg03.bounds.bottom, cobveg03.bounds.top))
deptos.plot(edgecolor="#888888", color="#ffffff00", ax=ax)
plt.title("Sugar cane")
# rasterio's index() maps (x=lon, y=lat) to (row, col); note that despite the
# names, cobvegXs are rows and cobvegYs are columns
%time cobvegXs,cobvegYs = cobveg03.index(lonsD, latsD)
cobveg03.shape
cobvegXs = np.array(cobvegXs)
cobvegYs = np.array(cobvegYs)
# keep only the valid points that fall inside the raster
validData = (~np.isnan(cobvegXs)) & (~np.isnan(cobvegYs)) & (cobvegXs > 0) & (cobvegYs > 0) & \
            (cobvegXs < cobveg03.shape[0]) & (cobvegYs < cobveg03.shape[1])
latsD = latsD[validData]
lonsD = lonsD[validData]
modVegs_D = modVegs_D[:, validData]
cobvegXs = cobvegXs[validData]
cobvegYs = cobvegYs[validData]
cobvegXs.shape
targets = cobveglayers[:, cobvegXs, cobvegYs]
targets.shape
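The fancy indexing above pulls, for every valid MODIS pixel, each land-cover layer's value at that pixel's (row, col). A quick shape check (a sketch with the names defined above):
# one 0/1 label per (layer, pixel) pair
assert targets.shape == (len(descs), cobvegXs.shape[0])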
pd.options.display.max_rows=100
descripciones = targets.sum(axis=1)
descripciones = pd.Series(descripciones, index=descs)
descripciones
targets_masde100puntos = np.argwhere(descripciones>100).T
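The positional indices in targets_masde100puntos can be mapped back to layer names; a quick sketch:
# names of the land-cover classes with more than 100 positive pixels
descs.iloc[targets_masde100puntos.ravel()]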
plt.rcParams["figure.figsize"] = (6,6)
ax = plt.subplot(111)
ax.scatter(lonsD[targets[24,:]==1], latsD[targets[24,:]==1], s= 0.01)
deptos.plot(color="#ffffff00", edgecolor="#888888", ax=ax)
plt.title("Caña de azúcar - de acuerdo a post-procesado")
Visualizing several distinct vegetation types across the 5 bands:
# use the tab10 qualitative palette for the line colors
plt.rcParams["figure.figsize"] = (10,10)
for j, componente in enumerate(["EVI", "red", "NIR", "blue", "MIR"]):
    plt.subplot(5, 1, j+1)
    plt.title("Component " + componente)
    for i, df in enumerate([24, 43, 12, 9]):
        # rows j, j+5, j+10, ... are band j across the 23 dates
        means = modVegs_D[j:115:5, (targets[df,:]==1) & (modVegs_D.min(axis=0)>0)].mean(axis=1)
        stds = modVegs_D[j:115:5, (targets[df,:]==1) & (modVegs_D.min(axis=0)>0)].std(axis=1)
        plt.plot(means, label=descs[df], alpha=0.5, c=mpl.cm.tab10.colors[i])
        plt.fill_between(range(0, len(means)), means - stds, means + stds, alpha=0.1, color=mpl.cm.tab10.colors[i])
    plt.xticks(rotation=90)
    if j==0:
        plt.legend(loc=(0,1))
plt.tight_layout()
0
from sklearn.decomposition import PCA
pcamodel = PCA()
modVegs_D.shape
# fit the PCA using only the EVI rows (every 5th feature, starting at column 0)
pcafit = pcamodel.fit(modVegs_D.T[:,::5])
modVegs_D[:, (targets[24,:]==1)][:,0]
plt.rcParams["figure.figsize"] = (10,4)
plt.plot(pcafit.components_[0])
pcafit.explained_variance_ratio_
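How much of the variance the leading components capture is easier to read from the cumulative sum; a small sketch:
# cumulative explained variance of the EVI-only PCA
plt.plot(np.cumsum(pcafit.explained_variance_ratio_))
plt.xlabel("number of components")
None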
# median (0.5 quantile) of each principal component for up to 1000 pixels of four classes
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[12,:]==1)][::5,0:1000].T), 0.5,axis=0))
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[24,:]==1)][::5,0:1000].T), 0.5,axis=0))
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[43,:]==1)][::5,0:1000].T), 0.5,axis=0))
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[27,:]==1)][::5,0:1000].T), 0.5,axis=0))
None
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T,
test_size=10000, train_size=200000, random_state=1)
data_train, target_train
data_test.shape, target_test.shape, data_train.shape, target_train.shape
from sklearn.neighbors import KNeighborsClassifier
classifierKN = KNeighborsClassifier(n_neighbors=5, n_jobs=4)
classifierKN.fit(data_train, target_train[:,24])
testsize = 10000
target_pred = classifierKN.predict(data_test[0:testsize])
print(metrics.classification_report(target_test[0:testsize,24], target_pred[0:testsize])) #, target_names=descs))
# Using only the EVI we obtained:
#                    precision    recall  f1-score   support
# 25 Cana de azucar       0.87      0.52      0.65       811
# Using all 5 components (EVI, red, NIR, blue, MIR) we obtain:
#                         0.90      0.74      0.81       140
from sklearn.tree import DecisionTreeClassifier
classifierDT = DecisionTreeClassifier(criterion="entropy")
classifierDT.fit(data_train, target_train[:, 24])
testsize = 10000
target_pred = classifierDT.predict(data_test[0:testsize])
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred)
accuracy
print(metrics.classification_report(target_test[0:testsize, 24], target_pred[0:testsize])) #, target_names=descs))
# EVI only:
# 25 Cana de azucar       0.49      0.48      0.49      1683
# EVI plus the other components:
#                         0.68      0.71      0.69       140
from sklearn.ensemble import RandomForestClassifier
# Decision trees are faster than KNN, so we can use a larger sample size
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T,
                                                                    test_size=100000, train_size=500000, random_state=1)
classifierRF = RandomForestClassifier(n_jobs=4)
classifierRF.fit(data_train, target_train)
testsize = 100000
target_pred = classifierRF.predict(data_test[0:testsize])
accuracy = metrics.accuracy_score(target_test[0:testsize], target_pred)
accuracy
print(metrics.classification_report(target_test[0:testsize], target_pred[0:testsize], target_names=descs))
del classifierRF
import sklearn.naive_bayes as nb
clasifNB = nb.GaussianNB()
clasifNB.fit(data_train, target_train[:,24])
testsize = 10000
target_pred = clasifNB.predict(data_test[0:testsize])
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred[:])
accuracy
print(metrics.classification_report(target_test[0:testsize, 24], target_pred[0:testsize]))
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T,
test_size=0.5, train_size=0.3, random_state=2)
data_train.shape, data_test.shape
# SCORE: 0.85 0.84 0.85 (precision, recall, f1)
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=9, min_child_weight=5, eval_metric="auc",
# subsample=0.95, n_estimators=200, scale_pos_weight= 0.1 * 9000/140)
# SCORE: 0.90 0.87 0.89 ---- 0.90 0.87 0.89 (with all the data)
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc",
# subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
# SCORE: 0.88 0.86 0.87
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc", base_score=0.1,
# subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
# SCORE: 0.90 0.83 0.86
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc",
# subsample=0.8, n_estimators=400, scale_pos_weight= 0.1 * 9000/140, )
# configuration finally used; tree_method="gpu_hist" requires a CUDA-enabled xgboost build
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc", tree_method="gpu_hist",
                             subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
fit = clasifXGB.fit(data_train, target_train[:,24])
plt.rcParams["figure.figsize"] = (6,3)
# feature index = 5*date + band, so reshape to (dates, bands) to group the curves by band
plt.plot(fit.feature_importances_.reshape((-1,5)), marker="o")
#plt.plot(fit.feature_importances_)
plt.legend(labels=["EVI", "red", "NIR", "blue", "MIR"])
0
These models show that the 16-day periods contributing the most information for identifying sugar cane are those of March/April and November/December. This coincides with the end and the beginning, respectively, of the sugar cane harvest.
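A small sketch that ranks the periods by their total importance across the five bands (it assumes the (date, band) feature layout used above):
imp = fit.feature_importances_.reshape((-1, 5))  # rows = 16-day periods, columns = bands
np.argsort(imp.sum(axis=1))[::-1][:5]  # indices of the five most informative periods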
testsize = 200000
target_pred = clasifXGB.predict(data_test[0:testsize])
target_test[400:500,24], target_pred[400:500]
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred[:])
accuracy
print(metrics.classification_report(target_test[0:testsize, 24], target_pred[0:testsize]))
# I deleted the earlier EVI-only result, but it was roughly 0.81, 0.65, 0.7
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T,
test_size=100000, train_size=200000, random_state=2)
import tensorflow as tf
data_train.shape
normmean= np.mean(data_train)
tfmodel = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation="relu",
                          input_shape=[data_train.shape[1]],
                          kernel_initializer=tf.keras.initializers.RandomNormal(),
                          bias_initializer=tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(32, activation="relu",
                          kernel_initializer=tf.keras.initializers.RandomNormal(),
                          bias_initializer=tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(32, activation="relu",
                          kernel_initializer=tf.keras.initializers.RandomNormal(),
                          bias_initializer=tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(32, activation="relu",
                          kernel_initializer=tf.keras.initializers.RandomNormal(),
                          bias_initializer=tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
tfloss = "binary_crossentropy"
tfopt = tf.keras.optimizers.SGD(0.1)
tfmodel.compile(optimizer=tfopt,
loss=tfloss )
tfmodel.summary()
tfmodel.optimizer.lr.assign(0.01)
history = tfmodel.fit(data_train/normmean, target_train[:,24] , validation_split=0.2, batch_size=500, epochs=200, shuffle=True)
plt.rcParams["figure.figsize"] = (4,4)
plt.plot(history.history["val_loss"], label="validation loss")
plt.plot(history.history["loss"], label = "loss")
plt.legend()
None
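To compare the network against the other classifiers, a hedged sketch evaluating it on the held-out split with a 0.5 decision threshold (same normalization as in training):
# predicted probabilities -> 0/1 labels for the sugar-cane class
nn_pred = (tfmodel.predict(data_test/normmean) > 0.5).astype(int).ravel()
print(metrics.classification_report(target_test[:,24], nn_pred))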