In [1]:
import xarray as xa
import pandas as pd
import sklearn.decomposition as skdec
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from pyhdf.SD import SD, SDC, SDim
import os
import numpy as np
import pyproj
In [2]:
import geopandas as gp
import sklearn.linear_model as sklm
from sklearn.model_selection import train_test_split
In [3]:
from sklearn import metrics
In [4]:
import xgboost as xb
In [5]:
munis = gp.read_file("../../DATOS/GEO GT/gt-munis-lowq-2017_splits_from_2011.geojson")
In [6]:
deptos = gp.read_file("../../DATOS/GEO GT/gt-deptos-lowq-2017.geojson")
In [7]:
deptos = deptos[deptos.CODIGO!="2300"]
In [8]:
deptos.plot()
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba26df1208>

Cargar datos de MODIS del 2003:

In [9]:
def readHDF(filename, verbose=False):
    _file = SD(filename, SDC.READ)
    _sds_objs = [
        _file.select('500m 16 days EVI'),
        _file.select('500m 16 days red reflectance'),
        _file.select('500m 16 days NIR reflectance'),
        _file.select('500m 16 days blue reflectance'),
        _file.select('500m 16 days MIR reflectance')
    ]
    _datas = [x.get() for x in _sds_objs]
    if verbose:
        print("file: ", filename)
        print("file.info() = ", _file.info())
        print("data.shape = ", _datas[0].shape)
    _file.end()
    return np.array(_datas)
In [10]:
modVegs = []
for f in os.listdir("../../DATOS/MODIS/veg16d"):
    if f.endswith(".hdf") and "2003" in f:
        modVegs.append(readHDF("../../DATOS/MODIS/veg16d/" + f, True))
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003353.h09v07.006.2018222145920.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003033.h09v07.006.2015156052524.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003241.h09v07.006.2015153114124.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003001.h09v07.006.2015153072304.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003225.h09v07.006.2015153114015.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003337.h09v07.006.2015153130209.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003097.h09v07.006.2015158151815.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003049.h09v07.006.2015156080106.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003289.h09v07.006.2015153122733.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003161.h09v07.006.2015159142336.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003145.h09v07.006.2015158154336.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003209.h09v07.006.2015153105814.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003273.h09v07.006.2015153115159.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003193.h09v07.006.2015153105805.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003065.h09v07.006.2015156190603.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003081.h09v07.006.2015156190618.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003257.h09v07.006.2015153113857.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003113.h09v07.006.2015158152156.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003129.h09v07.006.2015158153540.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003321.h09v07.006.2015153125940.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003017.h09v07.006.2015154060403.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003305.h09v07.006.2015153124732.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
file:  ../../DATOS/MODIS/veg16d/MOD13A1.A2003177.h09v07.006.2015161125520.hdf
file.info() =  (12, 6)
data.shape =  (2400, 2400)
In [11]:
modVegs = np.array(modVegs)
In [12]:
modVegs.shape
Out[12]:
(23, 5, 2400, 2400)

La primer dimensión es el tiempo (períodos de 16 días). La segunda dimensión es de las distintas capas de datos, y las otras dos son longitud y latitud.

In [13]:
# Datasets de distintas fechas deben ser distintos:
np.sum(modVegs[20,:,:,:] - modVegs[5,:,:,:])
Out[13]:
148761560

Proyección

In [14]:
sinmodis = pyproj.Proj("+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs")
In [15]:
sinupper = (-10007554.677000,2223901.039333)
sinlower = (-8895604.157333,1111950.519667)
upper = sinmodis(sinupper[0], sinupper[1], inverse=True)
lower = sinmodis(sinlower[0], sinlower[1], inverse=True)
print("Upper bound: ", upper)
print("Lower bound: ", lower)
Upper bound:  (-95.77599951313655, 19.99999999820094)
Lower bound:  (-81.23412894333774, 9.999999999104968)
In [16]:
sina = np.linspace(sinupper[0], sinlower[0], 2400)
sinb = np.linspace(sinupper[1], sinlower[1], 2400)
sinspa, sinspb = np.meshgrid(sina, sinb)
In [17]:
lons, lats = sinmodis(sinspa.flatten(), sinspb.flatten(), inverse=True)
In [18]:
lons = lons.reshape((2400,2400))
lats = lats.reshape((2400,2400))

Dataframe

In [19]:
# En esta ventana está Guatemala
plt.imshow(modVegs[0,0,280:1500,40:1400])
Out[19]:
<matplotlib.image.AxesImage at 0x7fba027c4a90>
In [20]:
modVegs_B = modVegs[:,:,280:1500,40:1400]
modVegs_C = modVegs_B.reshape((-1,modVegs_B.shape[-2], modVegs_B.shape[-1]))
In [21]:
modVegs_C.shape
Out[21]:
(115, 1220, 1360)
In [22]:
# Aplanar:
modVegs_D = modVegs_C.reshape((115, -1))
latsD = lats[280:1500,40:1400].reshape((-1,))
lonsD = lons[280:1500,40:1400].reshape((-1,))
In [23]:
latsD.shape, modVegs_D.shape
Out[23]:
((1659200,), (115, 1659200))

Cargar datos del MAGA

In [24]:
import rasterio as rst
In [25]:
cobveg03 = rst.open("../../DATOS/SEGEPLAN Otros/cobertura_vegetal_uso_tierra_2003_layers.tif")
In [26]:
descs = cobveg03.descriptions
In [27]:
descs = pd.Series(descs)
In [28]:
cobveg03.crs
Out[28]:
CRS.from_epsg(4326)
In [29]:
print("Primeras veinte capas: ", ", ".join(descs[0:20]) + "...")
Primeras veinte capas:  1 Aeropuerto, 2 Agroindustria, 3 Aguacate, 4 Arbustos - matorrales, 5 Arbustos - matorrales, 6 Arena y/o material piroclastico, 7 Arroz, 8 Banano - platano, 9 Bosque conifero, 10 Bosque latifoliado, 11 Bosque mixto, 12 Cacao, 13 Cafe, 14 Cafe - cardamomo, 15 Camaronera y/o salina, 16 Coco, 17 Coplejo industrial, 18 Embalse (reservorio), 19 Frutales deciduos, 20 Granos basicos...
In [30]:
cobveglayers = []
for i in range(1,len(descs)+1):
    cobveglayers.append(cobveg03.read(i))
In [31]:
cobveglayers = np.array(cobveglayers)
In [32]:
ax = plt.subplot(111)
ax.imshow(cobveglayers[24,:,:], origin="bottom", cmap="Blues", extent=(cobveg03.bounds.left, cobveg03.bounds.right, cobveg03.bounds.top, cobveg03.bounds.bottom))
deptos.plot(edgecolor="#888888", color="#ffffff00", ax=ax)
plt.title("Azúcar")
Out[32]:
Text(0.5, 1, 'Azúcar')

Map modis data to maga cobveg data:

In [33]:
%time cobvegXs,cobvegYs = cobveg03.index(lonsD, latsD)
CPU times: user 3.66 s, sys: 65.9 ms, total: 3.73 s
Wall time: 3.74 s
In [34]:
cobveg03.shape
Out[34]:
(2000, 2000)
In [35]:
cobvegXs = np.array(cobvegXs)
cobvegYs = np.array(cobvegYs)
In [36]:
# escogiendo sólo los datos válidos, contenidos en el raster
validData = (np.isnan(cobvegXs)==False) & (np.isnan(cobvegYs)==False) & (cobvegXs> 0) & (cobvegYs>0) & \
               (cobvegXs < cobveg03.shape[0]) & (cobvegYs < cobveg03.shape[1])
latsD = latsD[validData]
lonsD = lonsD[validData]
modVegs_D = modVegs_D[:, validData]
cobvegXs = cobvegXs[validData]
cobvegYs = cobvegYs[validData]
In [37]:
cobvegXs.shape
Out[37]:
(900614,)
In [38]:
targets = cobveglayers[:, cobvegXs, cobvegYs]
In [39]:
targets.shape
Out[39]:
(64, 900614)
In [40]:
pd.options.display.max_rows=100
In [41]:
descripciones = targets.sum(axis=1)
descripciones = pd.Series(descripciones, index=descs)
In [42]:
descripciones
Out[42]:
1 Aeropuerto                                             67
2 Agroindustria                                          96
3 Aguacate                                              389
4 Arbustos - matorrales                              154180
5 Arbustos - matorrales                               87606
6 Arena y/o material piroclastico                       565
7 Arroz                                                 695
8 Banano - platano                                     1902
9 Bosque conifero                                     22402
10 Bosque latifoliado                                214038
11 Bosque mixto                                       57505
12 Cacao                                                 17
13 Cafe                                               42751
14 Cafe - cardamomo                                    1707
15 Camaronera y/o salina                                404
16 Coco                                                   5
17 Coplejo industrial                                   144
18 Embalse (reservorio)                                  85
19 Frutales deciduos                                    191
20 Granos basicos                                    141125
21 Granos basicos                                     17206
22 Hortaliza - ornamental                              3972
23 Campo y/o pista deportiva                             27
24 Canal - drenaje                                      172
25 Cana de azucar                                     14889
26 Cardamomo                                           7019
27 Cementerio                                            82
28 Centros poblados                                    9188
29 Citricos                                             537
30 Melon - sandia con riego                             669
31 Minas descubiertas y superficies de excavacion        77
32 Mosaico de cultivos                                  198
33 Hortaliza - ornamental con riego                      33
34 Huerto                                                81
35 Hule                                                4339
36 Humedal con bosque                                 11992
37 Humedal con otra vegetacion                         3766
38 Instalacion educativa                                112
39 Instalacion militar                                   16
40 Lago - laguna                                       7950
41 Manglar                                             1791
42 Mango                                                488
43 Otros frutales                                        32
44 Palma africana                                      2898
45 Te                                                     5
46 Vivero                                               101
47 Papaya                                                33
48 Parque recreativo                                     19
49 Pastos cultivados                                  34079
50 Pastos naturales y/o yerbazal                      81467
51 Yuca                                                   4
52 Zona inundable                                      8242
53 Zoologico                                              4
54 Pejibaya                                              41
55 Piña                                                 391
56 Plantacion conifera                                 1515
57 Plantacion latifoliada                               786
58 Playa y/o arena                                     1081
59 Prision                                               31
60 Puertos                                               29
61 Rambutan                                              22
62 Rio                                                12075
63 Roca expuesta                                         73
64 Suelo esteril                                          4
dtype: int64
In [43]:
targets_masde100puntos = np.argwhere(descripciones>100).T
/home/guillermo/.local/lib/python3.6/site-packages/numpy/core/fromnumeric.py:61: FutureWarning: Series.nonzero() is deprecated and will be removed in a future version.Use Series.to_numpy().nonzero() instead
  return bound(*args, **kwds)
In [44]:
plt.rcParams["figure.figsize"] = (6,6)
ax = plt.subplot(111)
ax.scatter(lonsD[targets[24,:]==1], latsD[targets[24,:]==1], s= 0.01)
deptos.plot(color="#ffffff00", edgecolor="#888888", ax=ax)
plt.title("Caña de azúcar - de acuerdo a post-procesado")
Out[44]:
Text(0.5, 1, 'Caña de azúcar - de acuerdo a post-procesado')

Explorando patrones en distintos cultivos

Visualizando 5 tipos de vegetación distinta en 5 bandas distintas:

In [123]:
mpl.colors.Colormap("tab10")
plt.rcParams["figure.figsize"] = (10,10)
for j, componente in enumerate(["EVI", "red", "NIR", "blue", "MIR"]):
    plt.subplot(5,1,j+1)
    plt.title("Componente " + componente)
    for i, df in enumerate([24,43,12,9]):
        if i==10: continue
        means = modVegs_D[j:115:5, (targets[df,:]==1) & (modVegs_D.min(axis=0)>0)] .mean(axis=1)
        stds = modVegs_D[j:115:5, (targets[df,:]==1)  & (modVegs_D.min(axis=0)>0)].std(axis=1)
        plt.plot(means, label = descs[df], alpha=0.5, c=mpl.cm.tab10.colors[i]) 
        plt.fill_between(range(0,len(means)), means - stds/1, means  + stds/1, alpha=0.1, color=mpl.cm.tab10.colors[i])
        plt.xticks(rotation=90)
    if j==0:
        plt.legend(loc=(0,1))

plt.tight_layout()
0
Out[123]:
0
In [124]:
from sklearn.decomposition import PCA
In [125]:
pcamodel = PCA()
In [126]:
modVegs_D.shape
Out[126]:
(115, 900614)
In [159]:
pcafit = pcamodel.fit(modVegs_D.T[:,::5])
In [147]:
modVegs_D[:, (targets[24,:]==1)][:,0]
Out[147]:
array([5261,  609, 3637,  387, 1120, 4899,  623, 3425,  382, 1301, 5335,
        555, 3624,  343,  868, 4185,  439, 2626,  293,  770, 5927,  590,
       4110,  374,  992, 5156,  582, 3606,  325, 1004, 5806,  688, 4095,
        474, 1303, 5196,  651, 3605,  440, 1287, 5627,  676, 3912,  479,
       1077, 5889,  818, 4400,  547, 1253, 5657,  658, 3998,  425, 1036,
       4126,  927, 3241,  638,  826, 5613,  611, 3878,  399,  930, 5795,
        774, 4284,  505,  997, 4780,  800, 3593,  505, 1484, 5085,  687,
       3578,  465, 1239, 5364,  658, 3866,  382, 1131, 4282,  902, 3237,
        669, 1540, 4506,  940, 3590,  604, 1586, 5704,  465, 3665,  324,
        930, 4092,  577, 2723,  410, 1067, 5610,  682, 4089,  400, 1199,
       5808,  744, 4271,  474, 1073], dtype=int16)
In [169]:
plt.rcParams["figure.figsize"] = (10,4)
plt.plot(pcafit.components_[0])
Out[169]:
[<matplotlib.lines.Line2D at 0x7fbd44c24dd8>]
In [174]:
pcafit.explained_variance_ratio_
Out[174]:
array([0.82721013, 0.04117885, 0.02444105, 0.01076888, 0.00945845,
       0.00935736, 0.00814287, 0.00737068, 0.00696104, 0.00649675,
       0.0059943 , 0.00567191, 0.00537024, 0.00525701, 0.00482668,
       0.00466834, 0.00391775, 0.00311107, 0.00267024, 0.00212455,
       0.00203278, 0.001693  , 0.00127607])
In [203]:
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[12,:]==1)][::5,0:1000].T), 0.5,axis=0))
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[24,:]==1)][::5,0:1000].T), 0.5,axis=0))
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[43,:]==1)][::5,0:1000].T), 0.5,axis=0))
plt.plot(np.quantile(pcafit.transform( modVegs_D[:, (targets[27,:]==1)][::5,0:1000].T), 0.5,axis=0))
None
In [43]:
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T, 
                                                                test_size=10000, train_size=200000, random_state=1)
In [44]:
data_train, target_train
Out[44]:
(array([[2779,  438, 1875, ..., 3030,  720, 1149],
        [2043, 1109, 2318, ..., 3552,  363, 1277],
        [2676, 1553, 3256, ..., 2673,  723, 2303],
        ...,
        [2893,  226, 1406, ..., 2913,  668, 1180],
        [3613,  306, 2151, ..., 2562,  210,  745],
        [3105,  158, 1643, ..., 2710,  140,  570]], dtype=int16),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int8))
In [45]:
data_test.shape, target_test.shape, data_train.shape, target_train.shape
Out[45]:
((10000, 115), (10000, 64), (200000, 115), (200000, 64))

Kneighbors classifier

In [160]:
from sklearn.neighbors import KNeighborsClassifier
In [161]:
classifierKN = KNeighborsClassifier(n_neighbors=5, n_jobs=4)
classifierKN.fit(data_train, target_train[:,24])
Out[161]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=4, n_neighbors=5, p=2,
                     weights='uniform')
In [162]:
testsize = 10000
target_pred = classifierKN.predict(data_test[0:testsize])
In [164]:
print(metrics.classification_report(target_test[0:testsize,24], target_pred[0:testsize])) #, target_names=descs))
# Sólo con el EVI se obtuvo esto: 
#                    precision    recall  f1-score   support
# 25 Cana de azucar       0.87      0.52      0.65       811
# Usando las 5 componentes (EVI, rojo, NIR, azul, MIR) se obtiene:
#                         0.90      0.74      0.81       140
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9860
           1       0.90      0.74      0.81       140

    accuracy                           1.00     10000
   macro avg       0.95      0.87      0.91     10000
weighted avg       0.99      1.00      0.99     10000

In [ ]:
 

Decision tree:

In [241]:
from sklearn.tree import DecisionTreeClassifier
In [242]:
classifierDT = DecisionTreeClassifier(criterion="entropy")
classifierDT.fit(data_train, target_train[:, 24])
Out[242]:
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
In [243]:
testsize = 10000
target_pred = classifierDT.predict(data_test[0:testsize])
In [244]:
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred)
In [245]:
accuracy
Out[245]:
0.9912
In [246]:
print(metrics.classification.classification_report(target_test[0:testsize, 24], target_pred[0:testsize])) #, target_names=descs))
# EVI
# 25 Cana de azucar       0.49      0.48      0.49      1683
# EVI y componentes
#                         0.68      0.71      0.69       140
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9844
           1       0.73      0.69      0.71       156

    accuracy                           0.99     10000
   macro avg       0.86      0.84      0.85     10000
weighted avg       0.99      0.99      0.99     10000

Random forest

In [51]:
from sklearn.ensemble import RandomForestClassifier
In [58]:
# Decision trees es más rápido que KN, podemos utilizar un mayor tamño de muestra
data_train, data_test, target_train, target_test = train_test_split(pxdatadf.iloc[:, dias], pxdatadf.iloc[:, ys], 
                                                                test_size=100000, train_size=500000, random_state=1)
In [59]:
classifierRF = RandomForestClassifier(n_jobs=4)
classifierRF.fit(data_train, target_train)
/home/guillermo/.local/lib/python3.6/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Out[59]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
In [60]:
testsize = 100000
target_pred = classifierRF.predict(data_test[0:testsize])
In [61]:
accuracy = metrics.accuracy_score(target_test[0:testsize], target_pred)
In [62]:
accuracy
Out[62]:
0.51276
In [63]:
print(metrics.classification.classification_report(target_test[0:testsize], target_pred[0:testsize], target_names=descs))
                                                   precision    recall  f1-score   support

                                     1 Aeropuerto       0.00      0.00      0.00         4
                                  2 Agroindustria       0.00      0.00      0.00         6
                                       3 Aguacate       0.00      0.00      0.00        43
                          4 Arbustos - matorrales       0.74      0.40      0.52     18452
                          5 Arbustos - matorrales       0.74      0.30      0.43     10343
                6 Arena y/o material piroclastico       1.00      0.07      0.12        60
                                          7 Arroz       0.00      0.00      0.00        72
                               8 Banano - platano       1.00      0.06      0.12       217
                                9 Bosque conifero       0.76      0.10      0.17      2642
                            10 Bosque latifoliado       0.81      0.58      0.68     25160
                                  11 Bosque mixto       0.70      0.23      0.34      6981
                                         12 Cacao       0.00      0.00      0.00         5
                                          13 Cafe       0.87      0.15      0.26      5138
                              14 Cafe - cardamomo       1.00      0.03      0.06       221
                         15 Camaronera y/o salina       0.00      0.00      0.00        27
                                          16 Coco       0.00      0.00      0.00         0
                            17 Coplejo industrial       0.00      0.00      0.00        15
                          18 Embalse (reservorio)       0.00      0.00      0.00         7
                             19 Frutales deciduos       0.00      0.00      0.00        14
                                20 Granos basicos       0.64      0.20      0.31     16845
                                21 Granos basicos       0.72      0.08      0.14      2097
                        22 Hortaliza - ornamental       1.00      0.01      0.02       509
                     23 Campo y/o pista deportiva       0.00      0.00      0.00         2
                               24 Canal - drenaje       0.00      0.00      0.00         2
                                25 Cana de azucar       0.92      0.42      0.58      1683
                                     26 Cardamomo       0.84      0.02      0.04       829
                                    27 Cementerio       0.00      0.00      0.00        10
                              28 Centros poblados       0.88      0.08      0.14      1070
                                      29 Citricos       0.00      0.00      0.00        74
                      30 Melon - sandia con riego       1.00      0.13      0.23        78
31 Minas descubiertas y superficies de excavacion       0.00      0.00      0.00        12
                           32 Mosaico de cultivos       0.00      0.00      0.00        27
              33 Hortaliza - ornamental con riego       0.00      0.00      0.00         5
                                        34 Huerto       0.00      0.00      0.00        10
                                          35 Hule       0.88      0.09      0.16       513
                            36 Humedal con bosque       0.74      0.06      0.12      1413
                   37 Humedal con otra vegetacion       0.75      0.01      0.01       452
                         38 Instalacion educativa       0.00      0.00      0.00        10
                           39 Instalacion militar       0.00      0.00      0.00         2
                                 40 Lago - laguna       0.84      0.11      0.20       512
                                       41 Manglar       0.00      0.00      0.00       135
                                         42 Mango       0.00      0.00      0.00        67
                                43 Otros frutales       0.00      0.00      0.00         1
                                44 Palma africana       1.00      0.09      0.16       364
                                            45 Te       0.00      0.00      0.00         1
                                        46 Vivero       0.00      0.00      0.00        13
                                        47 Papaya       0.00      0.00      0.00         5
                             48 Parque recreativo       0.00      0.00      0.00         2
                             49 Pastos cultivados       0.73      0.10      0.18      3965
                 50 Pastos naturales y/o yerbazal       0.74      0.14      0.24      9546
                                          51 Yuca       0.00      0.00      0.00         1
                                52 Zona inundable       0.46      0.01      0.01       939
                                     53 Zoologico       0.00      0.00      0.00         1
                                      54 Pejibaya       0.00      0.00      0.00         9
                                          55 Piña       0.00      0.00      0.00        31
                           56 Plantacion conifera       0.00      0.00      0.00       182
                        57 Plantacion latifoliada       0.00      0.00      0.00        87
                               58 Playa y/o arena       0.00      0.00      0.00        72
                                       59 Prision       0.00      0.00      0.00         4
                                       60 Puertos       0.00      0.00      0.00         1
                                      61 Rambutan       0.00      0.00      0.00         1
                                           62 Rio       1.00      0.00      0.00      1405
                                 63 Roca expuesta       0.00      0.00      0.00         5
                                 64 Suelo esteril       0.00      0.00      0.00         0

                                        micro avg       0.76      0.30      0.43    112399
                                        macro avg       0.32      0.05      0.08    112399
                                     weighted avg       0.75      0.30      0.40    112399
                                      samples avg       0.28      0.22      0.23    112399

/home/guillermo/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/guillermo/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1439: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
/home/guillermo/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/guillermo/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1439: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
/home/guillermo/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/home/guillermo/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1439: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
/home/guillermo/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels.
  'precision', 'predicted', average, warn_for)
/home/guillermo/.local/lib/python3.6/site-packages/sklearn/metrics/classification.py:1439: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels.
  'recall', 'true', average, warn_for)
In [64]:
del classifierRF

Naive Bayes

In [174]:
import sklearn.naive_bayes as nb
In [175]:
clasifNB = nb.GaussianNB()
In [176]:
clasifNB.fit(data_train, target_train[:,24])
Out[176]:
GaussianNB(priors=None, var_smoothing=1e-09)
In [177]:
testsize = 10000
target_pred = clasifNB.predict(data_test[0:testsize])
In [178]:
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred[:])
In [179]:
accuracy
Out[179]:
0.7626
In [180]:
print(metrics.classification.classification_report(target_test[0:testsize, 24], target_pred[0:testsize]))
              precision    recall  f1-score   support

           0       1.00      0.76      0.86      9860
           1       0.05      0.95      0.10       140

    accuracy                           0.76     10000
   macro avg       0.53      0.85      0.48     10000
weighted avg       0.99      0.76      0.85     10000

XGBoost

In [47]:
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T, 
                                                                test_size=0.5, train_size=0.3, random_state=2)
In [48]:
data_train.shape, data_test.shape
Out[48]:
((270184, 115), (450307, 115))
In [49]:
# SCORE: 0.85      0.84      0.85 (precision, recall, f1)
#        clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.3, verbosity=2, max_depth=9, min_child_weight=5, eval_metric="auc",
#                            subsample=0.95, n_estimators=200, scale_pos_weight= 0.1 * 9000/140)
# SCORE: 0.90      0.87      0.89  ----  0.90      0.87      0.89 (Con todos los datos)
#        clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc",
#                            subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
# SCORE: 0.88      0.86      0.87
#        clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc", base_score=0.1,
#                            subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
# SCORE: 0.90      0.83      0.86
#        clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc",
#                            subsample=0.8, n_estimators=400, scale_pos_weight= 0.1 * 9000/140, )
clasifXGB = xb.XGBClassifier(n_jobs=4, learning_rate=0.1, verbosity=2, max_depth=9, eval_metric="auc", tree_method="gpu_hist",
                            subsample=0.95, n_estimators=400, scale_pos_weight= 0.1 * 9000/140)
In [ ]:
fit = clasifXGB.fit(data_train, target_train[:,24])
In [53]:
plt.rcParams["figure.figsize"] = (6,3)
plt.plot(fit.feature_importances_.reshape((5,-1)).T, marker="o")
#plt.plot(fit.feature_importances_)
plt.legend(labels=range(0,5))
0
Out[53]:
0

Nota

Estos modelos muestran que las quincenas que más información aportan para la identificación de la caña de azúcar son las de Marzo/Abril y Noviembre/Diciembre. Esto coincide con el final e inicio, respectivamente, de la cosecha de la caña de azúcar.


In [56]:
testsize = 200000
target_pred = clasifXGB.predict(data_test[0:testsize],)
In [57]:
target_test[400:500,24], target_pred[400:500]
Out[57]:
(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8))
In [58]:
accuracy = metrics.accuracy_score(target_test[0:testsize, 24], target_pred[:])
In [59]:
accuracy
Out[59]:
0.995595
In [60]:
print(metrics.classification.classification_report(target_test[0:testsize, 24], target_pred[0:testsize]))
# Borré el resultado anterior para el uso de EVI, pero era algo como 0.81, 0.65  , 0.7
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    196760
           1       0.89      0.83      0.86      3240

    accuracy                           1.00    200000
   macro avg       0.94      0.91      0.93    200000
weighted avg       1.00      1.00      1.00    200000

Clasificador deep learning con Tensorflow

In [317]:
data_train, data_test, target_train, target_test = train_test_split(modVegs_D.T, targets.T, 
                                                                test_size=100000, train_size=200000, random_state=2)
In [ ]:
import tensorflow as tf
In [56]:
data_augmented.shape
Out[56]:
(230, 900614)
In [210]:
normmean= np.mean(data_train)
In [455]:
tfmodel = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation="relu", 
                          input_shape=[data_train.shape[1]], 
                          kernel_initializer = tf.keras.initializers.RandomNormal(), 
                          bias_initializer = tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(32, activation="relu", 
                          kernel_initializer = tf.keras.initializers.RandomNormal(),
                          bias_initializer = tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(32, activation="relu", 
                          kernel_initializer = tf.keras.initializers.RandomNormal(),
                          bias_initializer = tf.keras.initializers.RandomNormal()),
    tf.keras.layers.Dense(32, activation="relu", 
                          kernel_initializer = tf.keras.initializers.RandomNormal(),
                          bias_initializer = tf.keras.initializers.RandomNormal()),
    
    tf.keras.layers.Dense(1, activation="sigmoid")
])
In [456]:
tfloss = "binary_crossentropy"
tfopt = tf.keras.optimizers.SGD(0.1)
In [457]:
tfmodel.compile(optimizer=tfopt,
              loss=tfloss )
In [458]:
tfmodel.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_126 (Dense)            (None, 32)                3712      
_________________________________________________________________
dense_127 (Dense)            (None, 32)                1056      
_________________________________________________________________
dense_128 (Dense)            (None, 32)                1056      
_________________________________________________________________
dense_129 (Dense)            (None, 32)                1056      
_________________________________________________________________
dense_130 (Dense)            (None, 1)                 33        
=================================================================
Total params: 6,913
Trainable params: 6,913
Non-trainable params: 0
_________________________________________________________________
In [459]:
tfmodel.optimizer.lr.assign(0.01)
Out[459]:
<tf.Variable 'AssignVariableOp_24' shape=() dtype=float32>
In [ ]:
history = tfmodel.fit(data_train/normmean, target_train[:,24] , validation_split=0.2, batch_size=500, epochs=200, shuffle=True)
In [461]:
plt.rcParams["figure.figsize"] = (4,4)
plt.plot(history.history["val_loss"], label="validation loss")
plt.plot(history.history["loss"], label = "loss")
plt.legend()
None