import xarray as xa
import pandas as pd
import sklearn.decomposition as skdec
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from pyhdf.SD import SD, SDC, SDim
import os
import numpy as np
import pyproj
import geopandas as gp
import sklearn.linear_model as sklm
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xb
# Municipality and department boundaries of Guatemala (GeoJSON).
munis = gp.read_file("../../DATOS/GEO GT/gt-munis-lowq-2017_splits_from_2011.geojson")
deptos = gp.read_file("../../DATOS/GEO GT/gt-deptos-lowq-2017.geojson")
# Drop the department with code 2300 — NOTE(review): reason undocumented; confirm.
deptos = deptos[deptos.CODIGO!="2300"]
deptos.plot()
def readHDF(filename, verbose=False, datasets=None):
    """Read selected science datasets from a MODIS vegetation HDF4 file.

    Parameters
    ----------
    filename : str
        Path to the ``.hdf`` file.
    verbose : bool, optional
        If True, print the file name, ``SD.info()`` and the shape of the
        first extracted layer.
    datasets : sequence of str, optional
        SDS layer names to extract. Defaults to the five 500 m / 16-day
        vegetation layers (EVI, red, NIR, blue, MIR) used throughout
        this analysis.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(datasets), rows, cols)`` with the raw values.
    """
    if datasets is None:
        datasets = (
            '500m 16 days EVI',
            '500m 16 days red reflectance',
            '500m 16 days NIR reflectance',
            '500m 16 days blue reflectance',
            '500m 16 days MIR reflectance',
        )
    _file = SD(filename, SDC.READ)
    try:
        _datas = [_file.select(name).get() for name in datasets]
        if verbose:
            print("file: ", filename)
            print("file.info() = ", _file.info())
            print("data.shape = ", _datas[0].shape)
    finally:
        # Always release the HDF handle, even if a select()/get() fails.
        _file.end()
    return np.array(_datas)
# Load every 16-day MODIS vegetation composite for the year 2003.
modVegs = []
for f in os.listdir("../../DATOS/MODIS/veg16d"):
    if f.endswith(".hdf") and "2003" in f:
        modVegs.append(readHDF("../../DATOS/MODIS/veg16d/" + f, True))
# Resulting axes: (time, band, y, x) — see the reshape to 115 = 23 dates x 5 bands below.
modVegs = np.array(modVegs)
modVegs.shape
La primera dimensión es el tiempo (períodos de 16 días). La segunda dimensión es de las distintas capas de datos, y las otras dos son latitud y longitud.
# Datasets from different dates must differ; a non-zero difference sum
# confirms the time slices are not accidental duplicates.
np.sum(modVegs[20,:,:,:] - modVegs[5,:,:,:])
# MODIS sinusoidal projection (spherical earth, radius 6371007.181 m).
sinmodis = pyproj.Proj("+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs")
# Tile corner coordinates in sinusoidal metres (upper-left and lower-right).
sinupper = (-10007554.677000,2223901.039333)
sinlower = (-8895604.157333,1111950.519667)
# inverse=True converts projected metres back to lon/lat degrees.
upper = sinmodis(sinupper[0], sinupper[1], inverse=True)
lower = sinmodis(sinlower[0], sinlower[1], inverse=True)
print("Upper bound: ", upper)
print("Lower bound: ", lower)
# Build a 2400x2400 grid of coordinates spanning the tile, then convert
# every grid point to geographic lon/lat.
sina = np.linspace(sinupper[0], sinlower[0], 2400)
sinb = np.linspace(sinupper[1], sinlower[1], 2400)
sinspa, sinspb = np.meshgrid(sina, sinb)
lons, lats = sinmodis(sinspa.flatten(), sinspb.flatten(), inverse=True)
lons = lons.reshape((2400,2400))
lats = lats.reshape((2400,2400))
# Guatemala lies inside this pixel window of the tile.
plt.imshow(modVegs[0,0,280:1500,40:1400])
modVegs_B = modVegs[:,:,280:1500,40:1400]
# Merge the (time, band) axes: row index = time*5 + band.
modVegs_C = modVegs_B.reshape((-1,modVegs_B.shape[-2], modVegs_B.shape[-1]))
modVegs_C.shape
# Flatten the spatial axes: one column per pixel (115 rows = 23 dates x 5 bands).
modVegs_D = modVegs_C.reshape((115, -1))
latsD = lats[280:1500,40:1400].reshape((-1,))
lonsD = lons[280:1500,40:1400].reshape((-1,))
latsD.shape, modVegs_D.shape
import rasterio as rst
# 2003 land-cover / land-use raster; one band per use class, named in
# the per-band descriptions.
cobveg03 = rst.open("../../DATOS/SEGEPLAN Otros/cobertura_vegetal_uso_tierra_2003_layers.tif")
descs = cobveg03.descriptions
descs = pd.Series(descs)
cobveg03.crs
print("Primeras veinte capas: ", ", ".join(descs[0:20]) + "...")
# Read every band of the land-use raster into one (bands, rows, cols) array.
# rasterio band indices are 1-based, hence the shifted range.
cobveglayers = np.array([cobveg03.read(band) for band in range(1, len(descs) + 1)])
ax = plt.subplot(111)
# Layer 24 is sugar cane. origin must be "upper" or "lower" in matplotlib
# (the original "bottom" is rejected by modern versions; older versions
# silently treated any non-"upper" value as "lower").
# NOTE(review): extent is (left, right, top, bottom) — top/bottom deliberately
# swapped so the raster lines up with the department overlay; confirm.
ax.imshow(cobveglayers[24,:,:], origin="lower", cmap="Blues", extent=(cobveg03.bounds.left, cobveg03.bounds.right, cobveg03.bounds.top, cobveg03.bounds.bottom))
deptos.plot(edgecolor="#888888", color="#ffffff00", ax=ax)
plt.title("Azúcar")
# Map each MODIS pixel's lon/lat to (row, col) indices in the land-use raster.
%time cobvegXs,cobvegYs = cobveg03.index(lonsD, latsD)
cobveg03.shape
cobvegXs = np.array(cobvegXs)
cobvegYs = np.array(cobvegYs)
# Keep only points that fall inside the raster.
# NOTE: rasterio's index() returns (row, col): Xs are rows (checked against
# shape[0]) and Ys are columns (checked against shape[1]).
validData = (np.isnan(cobvegXs)==False) & (np.isnan(cobvegYs)==False) & (cobvegXs> 0) & (cobvegYs>0) & \
    (cobvegXs < cobveg03.shape[0]) & (cobvegYs < cobveg03.shape[1])
latsD = latsD[validData]
lonsD = lonsD[validData]
modVegs_D = modVegs_D[:, validData]
cobvegXs = cobvegXs[validData]
cobvegYs = cobvegYs[validData]
cobvegXs.shape
# Land-use value of every layer at each remaining pixel: shape (layers, pixels).
targets = cobveglayers[:, cobvegXs, cobvegYs]
targets.shape
pd.options.display.max_rows=100
# Number of positive pixels per land-use layer, labelled by description.
descripciones = targets.sum(axis=1)
descripciones = pd.Series(descripciones, index=descs)
descripciones
# Indices of layers with more than 100 positive pixels (usable for modelling).
targets_masde100puntos = np.argwhere(descripciones>100).T
plt.rcParams["figure.figsize"] = (6,6)
ax = plt.subplot(111)
# Scatter the pixels labelled as layer 24 (sugar cane) over the departments.
ax.scatter(lonsD[targets[24,:]==1], latsD[targets[24,:]==1], s= 0.01)
deptos.plot(color="#ffffff00", edgecolor="#888888", ax=ax)
plt.title("Caña de azúcar - de acuerdo a post-procesado")
Visualizando 4 tipos de vegetación distinta en 5 bandas distintas:
# Plot the mean ± std time series of each spectral band for four land-use
# classes (layers 24, 43, 12 and 9), restricted to pixels whose values are
# strictly positive across all rows (MODIS fill values are non-positive).
# Removed: a no-op `mpl.colors.Colormap("tab10")` call and a dead
# `if i==10: continue` guard (i only ranges over 0..3).
plt.rcParams["figure.figsize"] = (10,10)
for j, componente in enumerate(["EVI", "red", "NIR", "blue", "MIR"]):
    plt.subplot(5,1,j+1)
    plt.title("Componente " + componente)
    for i, df in enumerate([24,43,12,9]):
        # Rows j, j+5, j+10, ... of modVegs_D are the 23 dates of band j.
        mask = (targets[df,:]==1) & (modVegs_D.min(axis=0)>0)
        means = modVegs_D[j:115:5, mask].mean(axis=1)
        stds = modVegs_D[j:115:5, mask].std(axis=1)
        plt.plot(means, label = descs[df], alpha=0.5, c=mpl.cm.tab10.colors[i])
        plt.fill_between(range(0,len(means)), means - stds, means + stds, alpha=0.1, color=mpl.cm.tab10.colors[i])
    plt.xticks(rotation=90)
    if j==0:
        plt.legend(loc=(0,1))
plt.tight_layout()
0
from sklearn.decomposition import PCA
pcamodel = PCA()
modVegs_D.shape
# Fit PCA on the EVI series only: every 5th row starting at 0 is the EVI
# value of consecutive dates (rows interleave bands as time*5 + band).
pcafit = pcamodel.fit(modVegs_D.T[:,::5])
modVegs_D[:, (targets[24,:]==1)][:,0]
plt.rcParams["figure.figsize"] = (10,4)
plt.plot(pcafit.components_[0])
pcafit.explained_variance_ratio_
# Median of the PCA projection for the first 1000 pixels of four land-use
# classes (12, 24, 43 and 27).
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[12,:]==1)][::5,0:1000].T), 0.5,axis=0))
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[24,:]==1)][::5,0:1000].T), 0.5,axis=0))
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[43,:]==1)][::5,0:1000].T), 0.5,axis=0))
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[27,:]==1)][::5,0:1000].T), 0.5,axis=0))
None
# Hold-out split: 200k training pixels, 10k test pixels.
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T,
                                                                    test_size=10000, train_size=200000, random_state=1)
data_train, target_train
data_test.shape, target_test.shape, data_train.shape, target_train.shape
from sklearn.neighbors import KNeighborsClassifier
# k-NN baseline for layer 24 (sugar cane).
classifierKN = KNeighborsClassifier(n_neighbors=5, n_jobs=4)
classifierKN.fit(data_train, target_train[:,24])
testsize = 10000
target_pred = classifierKN.predict(data_test[0:testsize])
print(metrics.classification_report(target_test[0:testsize,24], target_pred[0:testsize])) #, target_names=descs))
# With EVI alone this gave:
#                 precision    recall  f1-score   support
# 25 Cana de azucar    0.87      0.52      0.65       811
# Using all 5 components (EVI, red, NIR, blue, MIR):
#                      0.90      0.74      0.81       140
from sklearn.tree import DecisionTreeClassifier
# Single decision tree baseline for layer 24 (sugar cane).
classifierDT = DecisionTreeClassifier(criterion="entropy")
classifierDT.fit(data_train, target_train[:, 24])
testsize = 10000
target_pred = classifierDT.predict(data_test[0:testsize])
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred)
accuracy
# FIX: sklearn.metrics.classification was a private module removed in
# scikit-learn 0.24; classification_report lives directly in sklearn.metrics.
print(metrics.classification_report(target_test[0:testsize, 24], target_pred[0:testsize])) #, target_names=descs))
# EVI only:
# 25 Cana de azucar    0.49      0.48      0.49      1683
# EVI plus the other components:
#                      0.68      0.71      0.69       140
from sklearn.ensemble import RandomForestClassifier
# Decision trees are faster than k-NN, so a larger sample can be used.
# NOTE(review): `pxdatadf`, `dias` and `ys` are not defined anywhere in this
# file — this cell presumably refers to a DataFrame built in another
# session/notebook; confirm before running.
data_train, data_test, target_train, target_test = train_test_split(pxdatadf.iloc[:, dias], pxdatadf.iloc[:, ys],
                                                                    test_size=100000, train_size=500000, random_state=1)
classifierRF = RandomForestClassifier(n_jobs=4)
classifierRF.fit(data_train, target_train)
testsize = 100000
target_pred = classifierRF.predict(data_test[0:testsize])
accuracy = metrics.accuracy_score(target_test[0:testsize], target_pred)
accuracy
# FIX: sklearn.metrics.classification was removed in scikit-learn 0.24;
# use the public classification_report.
print(metrics.classification_report(target_test[0:testsize], target_pred[0:testsize], target_names=descs))
del classifierRF
import sklearn.naive_bayes as nb
# Gaussian naive Bayes baseline for layer 24 (sugar cane).
clasifNB = nb.GaussianNB()
clasifNB.fit(data_train, target_train[:,24])
testsize = 10000
target_pred = clasifNB.predict(data_test[0:testsize])
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred[:])
accuracy
# FIX: sklearn.metrics.classification was removed in scikit-learn 0.24;
# use the public classification_report.
print(metrics.classification_report(target_test[0:testsize, 24], target_pred[0:testsize]))
# Fresh split: 30% train / 50% test of all pixels.
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T,
                                                                    test_size=0.5, train_size=0.3, random_state=2)
data_train.shape, data_test.shape
# Previous hyper-parameter attempts, kept with their scores for reference:
# SCORE: 0.85 0.84 0.85 (precision, recall, f1)
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=9, min_child_weight=5, eval_metric="auc",
#                              subsample=0.95, n_estimators=200, scale_pos_weight= 0.1 * 9000/140)
# SCORE: 0.90 0.87 0.89 ---- 0.90 0.87 0.89 (with all the data)
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc",
#                              subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
# SCORE: 0.88 0.86 0.87
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc", base_score=0.1,
#                              subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
# SCORE: 0.90 0.83 0.86
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc",
#                              subsample=0.8, n_estimators=400, scale_pos_weight= 0.1 * 9000/140, )
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc", tree_method="gpu_hist",
                             subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
fit = clasifXGB.fit(data_train, target_train[:,24])
plt.rcParams["figure.figsize"] = (6,3)
# Importances reshaped to (5 bands, dates) so each band gets its own line.
plt.plot(fit.feature_importances_.reshape((5,-1)).T, marker="o")
#plt.plot(fit.feature_importances_)
plt.legend(labels=range(0,5))
0
Estos modelos muestran que las quincenas que más información aportan para la identificación de la caña de azúcar son las de Marzo/Abril y Noviembre/Diciembre. Esto coincide con el final e inicio, respectivamente, de la cosecha de la caña de azúcar.
testsize = 200000
target_pred = clasifXGB.predict(data_test[0:testsize],)
# Spot-check a slice of predictions against the ground truth.
target_test[400:500,24], target_pred[400:500]
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred[:])
accuracy
# FIX: sklearn.metrics.classification was removed in scikit-learn 0.24;
# use the public classification_report.
print(metrics.classification_report(target_test[0:testsize, 24], target_pred[0:testsize]))
# The earlier EVI-only result was deleted, but it was roughly 0.81, 0.65, 0.7.
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T,
                                                                    test_size=100000, train_size=200000, random_state=2)
import tensorflow as tf
# NOTE(review): data_augmented is only created further down in this file;
# this line raises NameError if the file is executed top to bottom
# (notebook cell-order artifact).
data_augmented.shape
# Normalisation constant: global mean of the training data.
normmean= np.mean(data_train)
# 4 hidden layers of 32 ReLU units; sigmoid output for binary classification.
tfmodel = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation="relu",
                          input_shape=[data_train.shape[1]],
                          kernel_initializer = tf.keras.initializers.RandomNormal(),
                          bias_initializer = tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(32, activation="relu",
                          kernel_initializer = tf.keras.initializers.RandomNormal(),
                          bias_initializer = tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(32, activation="relu",
                          kernel_initializer = tf.keras.initializers.RandomNormal(),
                          bias_initializer = tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(32, activation="relu",
                          kernel_initializer = tf.keras.initializers.RandomNormal(),
                          bias_initializer = tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
tfloss = "binary_crossentropy"
tfopt = tf.keras.optimizers.SGD(0.1)
tfmodel.compile(optimizer=tfopt,
                loss=tfloss )
tfmodel.summary()
# Lower the learning rate from 0.1 to 0.01 before training.
# NOTE(review): `optimizer.lr` is a legacy alias; newer Keras exposes
# `learning_rate` instead — confirm against the installed TF version.
tfmodel.optimizer.lr.assign(0.01)
history = tfmodel.fit(data_train/normmean, target_train[:,24] , validation_split=0.2, batch_size=500, epochs=200, shuffle=True)
plt.rcParams["figure.figsize"] = (4,4)
plt.plot(history.history["val_loss"], label="validation loss")
plt.plot(history.history["loss"], label = "loss")
plt.legend()
None
# Threshold the sigmoid output at 0.4 rather than the default 0.5.
pred_target_test = tfmodel.predict(data_test/normmean)
print(metrics.classification_report(target_test[:,24], np.int16(pred_target_test>0.4), digits=4))
Tras varios intentos, no se ha logrado un modelo que compita con lo obtenido por XGBoost.
# Neighbourhood (blur) features are built with scipy.ndimage below.
import scipy.ndimage as ndimg
modVegs_C.shape
Calculando el promedio del entorno de cada punto:
# Each kernel assignment below overwrites the previous one; only the last
# definition (the 9x9 Gaussian in the next cell) is actually used.
# This 3x3 ring kernel did not improve results:
# 0.90 0.86 0.88
kernel = np.ones((1, 3, 3))
kernel[:,1,1] = 0
kernel /= 8
kernel
# This 5x5 ring kernel improves both precision and recall:
# 0.91 0.91 0.91
kernel = np.ones((1, 5, 5))
kernel[:,2,2] = 0
kernel /= kernel.sum()
kernel
# Same score as the previous kernel:
# 0.91 0.91 0.91
kernel = np.zeros((1,9,9))
kernel[:,4,4] = 1
# 9x9 Gaussian (sigma=2) built by filtering a unit impulse.
kernel = ndimg.gaussian_filter(kernel, sigma=2)
plt.imshow(kernel[0])
# Blur each scene to add neighbourhood context to every pixel.
modVegs_C_blur = ndimg.convolve(modVegs_C, kernel)
plt.rcParams["figure.figsize"] = (20,10)
plt.subplot(1,2,1)
plt.imshow(modVegs_C[0,:,:])
plt.subplot(1,2,2)
plt.imshow(modVegs_C_blur[0,:,:])
modVegs_D_blur = modVegs_C_blur.reshape((115, -1))
modVegs_D_blur = modVegs_D_blur[:, validData]
# 230 features per pixel: 115 raw rows + 115 blurred rows.
data_augmented = np.concatenate([modVegs_D, modVegs_D_blur], axis=0)
# Train on the augmented (raw + blurred) features: 30% train / 50% test.
data_train, data_test, target_train, target_test = train_test_split(data_augmented.T, targets.T,
                                                                    test_size=0.5, train_size=0.3, random_state=2)
data_train.shape, data_test.shape
# Earlier hyper-parameter attempts (scores not recorded for this feature set):
# SCORE:
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=9, min_child_weight=5, eval_metric="auc",
#                              subsample=0.95, n_estimators=200, scale_pos_weight= 0.1 * 9000/140)
# SCORE:
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc",
#                              subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
# SCORE:
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc", base_score=0.1,
#                              subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
# SCORE:
# clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc",
#                              subsample=0.8, n_estimators=400, scale_pos_weight= 0.1 * 9000/140, )
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=8, eval_metric="auc", tree_method="gpu_hist", gamma=0.5,
                             subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
%time fit = clasifXGB.fit(data_train, target_train[:,24])
plt.rcParams["figure.figsize"] = (6,3)
# 10 feature groups: 5 raw bands + 5 blurred bands.
plt.plot(fit.feature_importances_.reshape((10,-1)).T[:,:], marker="o")
#plt.plot(fit.feature_importances_)
plt.yscale("log")
plt.legend(labels=range(0,10), loc=(1.1,0))
0
testsize = 200000
target_pred = clasifXGB.predict(data_test[0:testsize],)
target_test[400:500,24], target_pred[400:500]
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred[:])
accuracy
print(metrics.classification.classification_report(target_test[0:testsize, 24], target_pred[0:testsize]))
clasifXGB.save_model("XGBoostClassifier_modis_veg_kernel9x9gaussian_230features_modelo_azucar_0.91_0.91")
!ls -lha XGBoostClassifier_modis_veg_kernel9x9gaussian_230features_modelo_azucar_0.91_0.91
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
# DMatrix for the native xgboost CV API used at the end of this cell.
data_dmatrix = xb.DMatrix(data=data_train,label=target_train[:,24])
params = {
    'gamma': [0.3, 0.75, 0.5],
    'max_depth': [11,10,8],
}
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, eval_metric="auc", tree_method="gpu_hist",
                             subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
# Grid search over gamma/max_depth, optimising precision with 2-fold CV.
search = GridSearchCV(clasifXGB,
                      param_grid=params,
                      scoring='precision',
                      cv=2, verbose=3)
search.fit(data_train, target_train[:,24])
# FIX: the original cell ended with the incomplete expression `search.`
# (a SyntaxError); display the best hyper-parameters found instead.
search.best_params_
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
          'max_depth': 5, 'alpha': 10}
cv_results = xb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                   num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)
# NOTE(review): clasifXGB2 is never defined anywhere in this file — this line
# raises NameError as written; presumably a second estimator (e.g. the
# grid-searched best one) was meant. Confirm before running.
clasifXGB2.fit(data_train, target_train[:,24])
Se convierten los targets desde una matriz de 62 columnas (62 tipos de uso de suelo) a una columna categórica. Se nota que hay puntos con distintos usos de suelo simultáneos y eso complica el análisis. Si se asigna a cada punto una categoría que depende de los cultivos presentes en ese punto, se tienen cientos de categorías distintas. Cada conjunto único de usos de suelo se convierte en una nueva categoría. Debido a esto surgen conjuntos de usos de suelo con pocos casos positivos (menos de 100, por ejemplo), lo que hace imposible que puedan ser usados para predicción.
# Map each land-use layer with >500 positives to a single character so the
# combination of uses at a pixel can be encoded as a short label.
chars = np.array(list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"))
cultivos = np.argwhere(descripciones>500).reshape((-1)) # [24,12,43]
chars = chars[cultivos]
temp= descripciones[descripciones > 500]
pd.DataFrame({"i": temp.index, "n": temp.values, "l": chars})
chars.shape
# One comma-separated label per pixel, listing the crops present there.
targets_labels = [",".join(chars[row==1]) for row in targets.T[:,cultivos]]
targets_labels = np.array(targets_labels)
lblscounts = pd.Series.value_counts(targets_labels)
# Fraction of pixels whose label combination occurs more than 100 times.
lblscounts[lblscounts > 100].sum()/len(targets_labels)
lblscounts
Lo mejor será hacer modelos distintos para cada clase por separado.
# Three-class target for layer 24 (sugar cane): 0 = absent, 1 = only this
# crop present, 2 = this crop together with other land uses.
target_i = 24
targets_preproc = (targets[target_i,:]==1).astype(int)
targets_preproc[(targets_preproc==1) & (targets[:,:].sum(0)>1)] = 2
print(pd.Series.value_counts(targets_preproc))
data_train, data_test, target_train, target_test = train_test_split(data_augmented.T, targets_preproc,
                                                                    test_size=400000, train_size=200000, random_state=2)
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=8, scale_pos_weight=15,
                             eval_metric="logloss", tree_method="gpu_hist",
                             subsample=0.95, n_estimators=400)
fit = clasifXGB.fit(data_train, target_train)
plt.rcParams["figure.figsize"] = (6,3)
# 10 feature groups: 5 raw bands + 5 blurred bands.
plt.plot(fit.feature_importances_.reshape((10,-1)).T[:,:], marker="o")
#plt.plot(fit.feature_importances_)
#plt.yscale("log")
plt.legend(labels=range(0,10), loc=(1.1,0))
0
testsize = 300000
target_pred = clasifXGB.predict(data_test[0:testsize],)
accuracy = metrics.accuracy_score(target_test[0:testsize], target_pred[:])
accuracy
# FIX: sklearn.metrics.classification was removed in scikit-learn 0.24.
# The >0 comparison collapses classes 1 and 2 into a single positive class.
print(metrics.classification_report(target_test[0:testsize]>0, target_pred[0:testsize]>0, digits=4))
Utilizando dos clases para identificar el cultivo: una para identificar puntos que contienen exclusivamente el cultivo y otra para los puntos con varios cultivos a la vez.
Esto incrementa la precisión, pero empeora el recall como se ve abajo:
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=8, scale_pos_weight=10, eval_metric="auc", tree_method="gpu_hist", subsample=0.9, n_estimators=300) precision recall f1-score support False 0.9964 0.9994 0.9979 98371 True 0.9564 0.7815 0.8601 1629 accuracy 0.9959 100000 macro avg 0.9764 0.8904 0.9290 100000 weighted avg 0.9957 0.9959 0.9957 100000 ------------ ídem, subsample = 0.85, scale_pos_weight=(10, 5) precision recall f1-score support False 0.9963 0.9994 0.9979 98371 True 0.9548 0.7778 0.8572 1629 accuracy 0.9958 100000 macro avg 0.9756 0.8886 0.9275 100000 weighted avg 0.9957 0.9958 0.9956 100000 ------------------- clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=10, scale_pos_weight=5, eval_metric="logloss", tree_method="gpu_hist", subsample=0.95, n_estimators=300) precision recall f1-score support 0 0.9964 0.9993 0.9979 98371 1 0.8257 0.8234 0.8246 1076 2 0.5259 0.2568 0.3451 553 accuracy 0.9933 100000 macro avg 0.7827 0.6932 0.7225 100000 weighted avg 0.9920 0.9933 0.9924 100000 --------------------------------- ** usando sólo dos clases clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=8, scale_pos_weight=15, eval_metric="logloss", tree_method="gpu_hist", subsample=0.95, n_estimators=400) precision recall f1-score support 0 0.9986 0.9984 0.9985 295086 1 0.9054 0.9131 0.9092 4914 accuracy 0.9970 300000 macro avg 0.9520 0.9558 0.9539 300000 weighted avg 0.9970 0.9970 0.9970 300000 --------------------- ** usando las 2 clases clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=8, scale_pos_weight=15, eval_metric="logloss", tree_method="gpu_hist", subsample=0.95, n_estimators=400) precision recall f1-score support False 0.9973 0.9992 0.9982 295086 True 0.9441 0.8386 0.8882 4914 accuracy 0.9965 300000 macro avg 0.9707 0.9189 0.9432 300000 weighted avg 0.9964 0.9965 0.9964 300000
# Binary target for layer 24 (sugar cane); the mixed-use class-2 relabelling
# is deliberately disabled here for comparison with the previous experiment.
target_i = 24
targets_preproc = (targets[target_i,:]==1).astype(int)
#targets_preproc[(targets_preproc==1) & (targets[:,:].sum(0)>1)] = 2
print(pd.Series.value_counts(targets_preproc))
data_train, data_test, target_train, target_test = train_test_split(data_augmented.T, targets_preproc,
                                                                    test_size=200000, train_size=200000, random_state=2)
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=10, scale_pos_weight=5,
                             eval_metric="logloss", tree_method="gpu_hist",
                             subsample=0.95, n_estimators=300)
fit = clasifXGB.fit(data_train, target_train)
plt.rcParams["figure.figsize"] = (6,3)
# 10 feature groups: 5 raw bands + 5 blurred bands.
plt.plot(fit.feature_importances_.reshape((10,-1)).T[:,:], marker="o")
#plt.plot(fit.feature_importances_)
#plt.yscale("log")
plt.legend(labels=range(0,10), loc=(1.1,0))
0
testsize = 300000
target_pred = clasifXGB.predict(data_test[0:testsize],)
accuracy = metrics.accuracy_score(target_test[0:testsize], target_pred[:])
accuracy
# FIX: sklearn.metrics.classification was removed in scikit-learn 0.24.
print(metrics.classification_report(target_test[0:testsize]>0, target_pred[0:testsize]>0, digits=4))
# Three-class target for layer 43: 0 = absent, 1 = only this crop present,
# 2 = this crop mixed with other land uses.
target_i = 43
targets_preproc = (targets[target_i,:]==1).astype(int)
targets_preproc[(targets_preproc==1) & (targets[:,:].sum(0)>1)] = 2
print(pd.Series.value_counts(targets_preproc))
data_train, data_test, target_train, target_test = train_test_split(data_augmented.T, targets_preproc,
                                                                    test_size=400000, train_size=400000, random_state=2)
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=10, scale_pos_weight=15,
                             eval_metric="logloss", tree_method="gpu_hist",
                             subsample=0.95, n_estimators=300)
fit = clasifXGB.fit(data_train, target_train)
plt.rcParams["figure.figsize"] = (6,3)
# 10 feature groups: 5 raw bands + 5 blurred bands.
plt.plot(fit.feature_importances_.reshape((10,-1)).T[:,:], marker="o")
#plt.plot(fit.feature_importances_)
#plt.yscale("log")
plt.legend(labels=range(0,10), loc=(1.1,0))
0
testsize = 400000
target_pred = clasifXGB.predict(data_test[0:testsize],)
accuracy = metrics.accuracy_score(target_test[0:testsize], target_pred[:])
accuracy
# FIX: sklearn.metrics.classification was removed in scikit-learn 0.24.
# The >0 comparison collapses classes 1 and 2 into one positive class.
print(metrics.classification_report(target_test[0:testsize]>0, target_pred[0:testsize]>0, digits=4))
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=10, scale_pos_weight=5, eval_metric="logloss", tree_method="gpu_hist", subsample=0.95, n_estimators=300) precision recall f1-score support 0 0.9990 0.9999 0.9994 398734 1 0.8252 0.6866 0.7496 619 2 0.6061 0.3709 0.4602 647 False 0.9990 0.9999 0.9994 398734 True 0.9473 0.6817 0.7928 1266 --------------------- scale_pos_weight=15 precision recall f1-score support False 0.9990 0.9999 0.9994 398734 True 0.9506 0.6840 0.7956 1266 accuracy 0.9989 400000 macro avg 0.9748 0.8420 0.8975 400000 weighted avg 0.9988 0.9989 0.9988 400000
# Binary target for layer 43 (mixed-use relabelling disabled).
target_i = 43
targets_preproc = (targets[target_i,:]==1).astype(int)
#targets_preproc[(targets_preproc==1) & (targets[:,:].sum(0)>1)] = 2
print(pd.Series.value_counts(targets_preproc))
data_train, data_test, target_train, target_test = train_test_split(data_augmented.T, targets_preproc,
                                                                    test_size=200000, train_size=200000, random_state=2)
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=10, scale_pos_weight=15,
                             eval_metric="logloss", tree_method="gpu_hist",
                             subsample=0.95, n_estimators=400)
fit = clasifXGB.fit(data_train, target_train)
plt.rcParams["figure.figsize"] = (6,3)
# 10 feature groups: 5 raw bands + 5 blurred bands.
plt.plot(fit.feature_importances_.reshape((10,-1)).T[:,:], marker="o")
#plt.plot(fit.feature_importances_)
#plt.yscale("log")
plt.legend(labels=range(0,10), loc=(1.1,0))
0
testsize = 400000
target_pred = clasifXGB.predict(data_test[0:testsize],)
accuracy = metrics.accuracy_score(target_test[0:testsize], target_pred[:])
accuracy
# FIX: sklearn.metrics.classification was removed in scikit-learn 0.24.
print(metrics.classification_report(target_test[0:testsize]>0, target_pred[0:testsize]>0, digits=4))
Al no utilizar dos clases, disminuye bastante la precisión, aunque aumenta el recall.
# NOTE(review): this cell is an exact duplicate of the previous layer-43
# binary experiment; presumably kept to re-run after a kernel restart.
target_i = 43
targets_preproc = (targets[target_i,:]==1).astype(int)
#targets_preproc[(targets_preproc==1) & (targets[:,:].sum(0)>1)] = 2
print(pd.Series.value_counts(targets_preproc))
data_train, data_test, target_train, target_test = train_test_split(data_augmented.T, targets_preproc,
                                                                    test_size=200000, train_size=200000, random_state=2)
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=10, scale_pos_weight=15,
                             eval_metric="logloss", tree_method="gpu_hist",
                             subsample=0.95, n_estimators=400)
fit = clasifXGB.fit(data_train, target_train)
plt.rcParams["figure.figsize"] = (6,3)
# 10 feature groups: 5 raw bands + 5 blurred bands.
plt.plot(fit.feature_importances_.reshape((10,-1)).T[:,:], marker="o")
#plt.plot(fit.feature_importances_)
#plt.yscale("log")
plt.legend(labels=range(0,10), loc=(1.1,0))
0
testsize = 400000
target_pred = clasifXGB.predict(data_test[0:testsize],)
accuracy = metrics.accuracy_score(target_test[0:testsize], target_pred[:])
accuracy
# FIX: sklearn.metrics.classification was removed in scikit-learn 0.24.
print(metrics.classification_report(target_test[0:testsize]>0, target_pred[0:testsize]>0, digits=4))
Los datos ópticos de reflectancia para 4 bandas de MODIS agregados cada 16 días han sido utilizados exitosamente para clasificar el uso del suelo. Usar información de los pixeles cercanos mejora la capacidad del clasificador. El próximo paso será utilizar el producto MOD09A1.006, de las bandas 1 a la 7 y agregado por 8 días.