Scrape de datos de SICOIN
Ir al repositorio (los datos en CSV están allí)
Scraping de datos de SICOIN (2020)
SICOIN es el sistema de información de contabilidad del gobierno de Guatemala. Aparentemente es un sitio web hecho en ASP. Tiene muchos reportes sobre ingresos, gastos y el movimiento del dinero en el Estado. Como base de datos es un sistema muy completo, aunque en realidad no dice gran cosa sobre el gasto público. Para los funcionarios de Guatemala es muy sencillo esconder la corrupción y el despilfarro en esos datos, porque son ambiguos y usan categorías amplias de gasto. Aunque creo que así es la contabilidad en general: fácilmente se puede mentir en las cuentas.
Como base de datos es aceptable, pero la aplicación web es de lo peor. En primer lugar, no sería necesario hacer scraping de estos datos si SICOIN tuviera una API. Pero no: en lugar de una API, lo que tiene es frames dentro de frames dentro de frames. Es lo que pasa cuando se trabaja con ASP, .NET y tecnologías similares (aunque he visto aplicaciones decentes hechas con esas tecnologías, y esta parece estar hecha a propósito de la peor manera).
Para hacer el scraping uso Selenium con el driver para Google Chrome. He puesto las funciones para conectar con SICOIN en un archivo aparte (SICOIN_scraper) para organizar mejor las cosas. Al ver el código verán el desastre que hay que atravesar en SICOIN para acceder al menú principal o a los formularios.
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import os
from SICOIN_scraper import *
import xlrd
import re
import pandas as pd
import time
# Configure Chrome so that downloaded files are saved to /tmp/sicoin/ without
# showing a "save as" prompt.
options = webdriver.ChromeOptions()
options.headless = False  # set to True to run without a visible browser window
options.add_experimental_option("prefs", {
    "download.default_directory": os.path.abspath("/tmp/sicoin/"),
    "download.prompt_for_download": False,
})
# Pass the options through the `options` keyword: `chrome_options` is
# deprecated and emits a DeprecationWarning.
driver = webdriver.Chrome(options=options, executable_path="../../software/chromedriver")
# Override the browser's user agent through the Chrome DevTools Protocol.
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'})
<ipython-input-5-3104707cb48e>:9: DeprecationWarning: use options instead of chrome_options driver = webdriver.Chrome(chrome_options=options, executable_path="../../software/chromedriver")
{}
# Open the SICOIN site and authenticate. `login` is not defined in this file;
# presumably it comes from the SICOIN_scraper star import (TODO confirm).
driver.get('https://sicoin.minfin.gob.gt')
login(driver)
#driver.close()
# Expense report by dynamic groups
# Element ids handed to reportTypeA further down — presumably the menu links
# to click, in order, to reach the report (TODO confirm in SICOIN_scraper).
clicks_sequence = ["itemTextLink32", "itemTextLink50", "itemTextLink51", "itemTextLink55"]
def set_grupos():
    """Type the grouping choices into the report form.

    Sends "PROGRA" to the element with id ``lstCorte`` and "ACTIVIDAD U" to the
    element with id ``lstColumna``, using the module-level ``driver``.
    """
    selections = (
        ("lstCorte", "PROGRA"),
        ("lstColumna", "ACTIVIDAD U"),
    )
    for element_id, typed_text in selections:
        driver.find_element_by_id(element_id).send_keys(typed_text)
# Reload the current page before starting the download loop.
driver.refresh()
# One iteration per year; the range currently covers only 2020.
for year in range(2020,2021):
    print(year)
    # setYear and reportTypeA are imported helpers — presumably they select the
    # year in SICOIN and request the report download (TODO confirm in
    # SICOIN_scraper).
    setYear(driver, year)
    time.sleep(2)
    # set_grupos is handed over as a callback through other_options.
    reportTypeA(driver, clicks_sequence, [], other_options=set_grupos)
    # Fixed pause — presumably to let the download finish before reloading.
    time.sleep(10)
    driver.refresh()
2020 downloading
Parse data
# Read every Excel workbook found under /tmp/sicoin/ (subdirectories included).
# Note: os.walk yields file names in arbitrary order, so the position of a
# workbook inside xls_files is not stable between runs.
xls_files = []
for root, dirs, files in os.walk("/tmp/sicoin/"):
    for i, file in enumerate(files):
        # str.endswith accepts a tuple, so both extensions are tested at once.
        if file.endswith((".xls", ".xlsx")):
            print(i, file)
            # os.path.join avoids the doubled separator that manual string
            # concatenation produced ("/tmp/sicoin//name.xls").
            xls_files.append(xlrd.open_workbook(os.path.join(root, file)))
0 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo235554.xls 1 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo233830.xls 2 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo235759.xls 3 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo234333.xls 4 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo233954.xls 5 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo235502.xls 6 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo235838.xls 7 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo235253.xls 8 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo235033.xls 9 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo234211.xls 10 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo235720.xls 11 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo234645.xls 12 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo234509.xls 13 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo235916.xls 14 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo002545.xls 15 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo235638.xls 16 _172.25.3.18reportes$00804768jjp3w1gtgn5fkwy2xbtqxjvo233708.xls
def nextCell(row, offset = 0):
    """Return the index of the next non-empty cell in ``row``.

    Scanning starts at ``offset`` and skips cells whose ``ctype`` equals
    ``xlrd.XL_CELL_EMPTY``.

    Args:
        row: Sequence of cell objects exposing a ``ctype`` attribute.
        offset: Index at which the scan starts.

    Returns:
        ``None`` when ``offset`` is at or past the end of the row. Otherwise
        the index of the first non-empty cell at or after ``offset``. The scan
        never moves past the last cell, so the last index is returned even
        when that cell is empty.
    """
    lenr = len(row)
    if offset >= lenr:
        return None
    # Logical `and` (instead of bitwise `&`) short-circuits: once the last
    # cell is reached its type is no longer inspected, since the index is
    # returned regardless of its content.
    while offset + 1 < lenr and row[offset].ctype == xlrd.XL_CELL_EMPTY:
        offset += 1
    return offset
def process_row(row, truncate = None):
    """Extract the cells of ``row`` selected by ``nextCell`` as stripped strings.

    Args:
        row: Sequence of cell objects exposing ``value`` (and whatever
            ``nextCell`` needs).
        truncate: Optional column index; scanning stops at the first selected
            cell whose index is greater than this value.

    Returns:
        A ``(values, columns)`` tuple of two parallel lists: the cell values
        converted with ``str(...).strip()`` and their column indices.
    """
    values = []
    columns = []
    col = nextCell(row, 0)
    # nextCell returns None once the end of the row has been passed.
    while col is not None:
        if truncate is not None and col > truncate:
            break
        text = str(row[col].value).strip()
        columns.append(col)
        values.append(text)
        col = nextCell(row, col + 1)
    return values, columns
def tryFun(fun, param1):
    """Call ``fun(param1)`` and return its result, or ``None`` if it fails.

    Args:
        fun: Callable taking one positional argument.
        param1: Argument passed to ``fun``.

    Returns:
        The return value of ``fun(param1)``, or ``None`` when the call raises
        an ``Exception``.
    """
    try:
        return fun(param1)
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate instead of being silently turned into None.
    except Exception:
        return None
def process_table_simple(sheet, options):
    """Collect the rows that follow a start row of ``sheet`` into a DataFrame.

    Args:
        sheet: Worksheet object exposing ``nrows`` and ``row(i)``.
        options: Mapping with the keys
            ``"start"``: either an ``int`` row index, or a dict
            ``{"col": ..., "value": ...}`` — the start row is then the first
            row whose cell in column ``col`` has exactly that value;
            ``"truncate"`` (optional): forwarded to ``process_row``.

    Returns:
        A ``pandas.DataFrame`` with one record per kept row, keyed by the
        column indices returned by ``process_row``. Reading begins on the row
        AFTER the start row. Rows that yield no values, and rows whose first
        value starts with "tota" (case-insensitive), are skipped.

    Raises:
        TypeError: if ``options["start"]`` is neither an ``int`` nor a dict
            (previously this surfaced as an ``UnboundLocalError``).
    """
    start = options["start"]
    if isinstance(start, dict):
        # Look for the first occurrence of start["value"] in column start["col"].
        col, wanted = start["col"], start["value"]
        i = 0
        while i < sheet.nrows:
            r = sheet.row(i)
            if len(r) > col and r[col].value == wanted:
                break
            i += 1
    elif isinstance(start, int):
        i = start
    else:
        raise TypeError(
            'options["start"] must be an int or a dict, got '
            + type(start).__name__
        )

    # Loop-invariant: look the truncation column up once.
    truncate = options.get("truncate")
    data = []
    for row_index in range(i + 1, sheet.nrows):
        values, js = process_row(sheet.row(row_index), truncate)
        if not values:
            continue
        # Skip total / subtotal lines.
        if values[0].lower().startswith("tota"):
            continue
        data.append(dict(zip(js, values)))
    return pd.DataFrame.from_records(data)
def process_reporte(s):
    """Turn one downloaded report sheet into a DataFrame with named columns.

    Rows after row index 19 are read with ``process_table_simple`` (truncated
    at column 45), columns are ordered by their sheet index, column 10 is
    appended to column 2 (separated by " - ") wherever column 2 has a value,
    column 44 is dropped, the remaining columns are renamed, and the five
    identifying columns are forward-filled.

    Args:
        s: Worksheet object accepted by ``process_table_simple``.

    Returns:
        The resulting ``pandas.DataFrame``.

    Raises:
        ValueError: if the column names cannot be assigned (for example when
            the sheet does not produce the expected number of columns).
    """
    column_names = ["CodEntidad", "CodPrograma", "CodActOb","Programa", "ActOb",
                    "Asignado", "Modificado", "Vigente", "PreCompromiso", "Comprometido", "Devengado",
                    "Pagado", "SaldoPorComprometer", "SaldoPorDevengar", "SaldoPorPagar", "PorcentajeE"]

    df = process_table_simple(s, {"start": 19, "truncate": 45})
    # Axis passed by keyword: the positional form is deprecated in pandas.
    df = df.sort_index(axis=1)
    has_code = df[2].notna()
    df.loc[has_code, 2] = df.loc[has_code, 2] + " - " + df.loc[has_code, 10]
    # Column 44 is presumably the trailing cell that the row scanner always
    # emits (TODO confirm against the sheet layout).
    df.drop(columns=44, inplace=True)
    try:
        df.columns = column_names
    except Exception as err:
        print("Error al poner columnas")
        print(df.head())
        # Keep raising ValueError as before, but preserve the original cause.
        raise ValueError(
            f"expected {len(column_names)} columns, got {len(df.columns)}"
        ) from err
    # Series.ffill() replaces the deprecated fillna(method="ffill").
    for col in ["CodEntidad", "CodPrograma", "CodActOb","Programa", "ActOb"]:
        df[col] = df[col].ffill()
    return df
# Preview the first rows of the first sheet of the first workbook, with every
# cell value cut to 10 characters, to see where headers and data start.
s = xls_files[0].sheet_by_index(0)
# Cap at s.nrows so a sheet shorter than 20 rows does not raise IndexError.
for i in range(min(20, s.nrows)):
    print(i, "| ", " | ".join(str(x.value)[0:10] for x in s.row(i)))
0 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1 | | | Sistema de | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 2 | | | Ejecución | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 3 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | PAGINA : | | | 1.0 | | DE | | 56.0 | | | | 4 | | | Ejecucion | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 5 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | FECHA | | | 44157.0 | | | | | | | | 6 | | | Expresado | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 7 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | HORA | | | | 0.99732638 | | | | | | | 8 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 9 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | REPORTE : | | | R00804768. 
| | | | | | | | 10 | | | - ENTIDAD | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 11 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 12 | | | DEL MES EN | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 13 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 14 | | EJERCICIO: | | | | | | | 2014.0 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 15 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 16 | | | | | | | | | | | DESCRIPCIO | | | | | ASIGNADO | | MODIFICADO | | | VIGENTE | | PRE COMPR | | COMPROMETI | | DEVENGADO | | PAGADO | | SALDO POR | | SALDO POR | | | SALDO POR | | | | | % EJEC | | | | 17 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 18 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 19 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
# Optimizing the data-extraction code
s = xls_files[0].sheet_by_index(0)
# %prun is an IPython magic (not plain Python): it runs the statement under the
# profiler and prints per-function call counts and timings.
%prun process_reporte(s)
455853 function calls (455738 primitive calls) in 0.268 seconds Ordered by: internal time ncalls tottime percall cumtime percall filename:lineno(function) 158400 0.081 0.000 0.111 0.000 sheet.py:403(cell) 3520 0.036 0.000 0.147 0.000 sheet.py:474(<listcomp>) 20480 0.033 0.000 0.035 0.000 <ipython-input-124-27a67d165287>:1(nextCell) 3520 0.030 0.000 0.071 0.000 <ipython-input-124-27a67d165287>:9(process_row) 158400 0.030 0.000 0.030 0.000 sheet.py:2303(__init__) 1 0.019 0.019 0.255 0.255 <ipython-input-124-27a67d165287>:32(process_table_simple) 3520 0.004 0.000 0.151 0.000 sheet.py:470(row) 37318 0.004 0.000 0.004 0.000 {method 'append' of 'list' objects} 27987/27893 0.004 0.000 0.004 0.000 {built-in method builtins.len} 16960 0.002 0.000 0.002 0.000 {method 'strip' of 'str' objects} 21 0.002 0.000 0.002 0.000 {pandas._libs.lib.infer_dtype} 1 0.001 0.001 0.001 0.001 {pandas._libs.lib.dicts_to_array} 3328 0.001 0.000 0.002 0.000 construction.py:637(<genexpr>) 8 0.001 0.000 0.001 0.000 {built-in method pandas._libs.missing.isnaobj} 3520 0.001 0.000 0.001 0.000 {method 'startswith' of 'str' objects} 2 0.001 0.000 0.001 0.000 {pandas._libs.algos.take_2d_axis0_object_object} 1074 0.001 0.000 0.001 0.000 generic.py:10(_check) 2376 0.001 0.000 0.002 0.000 {built-in method builtins.isinstance} 3535 0.001 0.000 0.001 0.000 {method 'lower' of 'str' objects} 1 0.001 0.001 0.002 0.002 {pandas._libs.lib.fast_unique_multiple_list_gen} 27 0.001 0.000 0.001 0.000 {pandas._libs.lib.maybe_convert_objects} 1 0.001 0.001 0.268 0.268 <ipython-input-124-27a67d165287>:64(process_reporte) 1 0.000 0.000 0.000 0.000 construction.py:643(<listcomp>) 1 0.000 0.000 0.268 0.268 <string>:1(<module>) 1 0.000 0.000 0.000 0.000 managers.py:1843(_stack_arrays) 44 0.000 0.000 0.001 0.000 generic.py:5141(__setattr__) 53 0.000 0.000 0.000 0.000 {built-in method numpy.empty} 3328 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects} 1580 0.000 0.000 0.000 0.000 {built-in method builtins.getattr} 
163/155 0.000 0.000 0.000 0.000 {built-in method numpy.array} 36 0.000 0.000 0.001 0.000 cast.py:1310(maybe_cast_to_datetime) 5 0.000 0.000 0.000 0.000 managers.py:1043(iset) 5 0.000 0.000 0.001 0.000 missing.py:635(pad_2d) 18 0.000 0.000 0.000 0.000 {method 'reduce' of 'numpy.ufunc' objects} 169 0.000 0.000 0.000 0.000 common.py:1600(_is_dtype_type) 34 0.000 0.000 0.001 0.000 blocks.py:2655(get_block_type) 165 0.000 0.000 0.000 0.000 common.py:1460(is_extension_array_dtype) 36 0.000 0.000 0.001 0.000 cast.py:1201(maybe_infer_to_datetimelike) 16 0.000 0.000 0.000 0.000 {method 'copy' of 'numpy.ndarray' objects} 807 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass} 164 0.000 0.000 0.000 0.000 base.py:413(find) 22 0.000 0.000 0.004 0.000 construction.py:390(sanitize_array) 57 0.000 0.000 0.000 0.000 _dtype.py:319(_name_get) 205 0.000 0.000 0.001 0.000 base.py:256(is_dtype) 23 0.000 0.000 0.001 0.000 series.py:201(__init__) 5 0.000 0.000 0.000 0.000 {built-in method pandas._libs.lib.is_bool_array} 117 0.000 0.000 0.000 0.000 common.py:194(is_object_dtype) 49/47 0.000 0.000 0.000 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function} 36 0.000 0.000 0.000 0.000 {pandas._libs.lib.infer_datetimelike_array} 5/4 0.000 0.000 0.000 0.000 base.py:293(__new__) 4 0.000 0.000 0.001 0.000 indexing.py:1078(_getitem_axis) 31 0.000 0.000 0.000 0.000 blocks.py:124(__init__) 4 0.000 0.000 0.000 0.000 managers.py:238(_rebuild_blknos_and_blklocs) 22 0.000 0.000 0.000 0.000 cast.py:1187(maybe_castable) 86 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_list_like} 22 0.000 0.000 0.001 0.000 construction.py:520(_try_cast) 5 0.000 0.000 0.000 0.000 blocks.py:322(set) 1 0.000 0.000 0.001 0.001 managers.py:1715(form_blocks) 2 0.000 0.000 0.002 0.001 managers.py:1300(_slice_take_blocks_ax0) 10 0.000 0.000 0.001 0.000 frame.py:2869(__getitem__) 57 0.000 0.000 0.000 0.000 numerictypes.py:365(issubdtype) 34 0.000 0.000 0.000 0.000 common.py:224(is_sparse) 9 0.000 
0.000 0.002 0.000 missing.py:193(_isna_ndarraylike) 114 0.000 0.000 0.000 0.000 numerictypes.py:293(issubclass_) 9 0.000 0.000 0.003 0.000 managers.py:366(apply) 26 0.000 0.000 0.000 0.000 generic.py:195(__init__) 8 0.000 0.000 0.001 0.000 missing.py:235(_isna_string_dtype) 57 0.000 0.000 0.000 0.000 dtypes.py:906(is_dtype) 27 0.000 0.000 0.000 0.000 numeric.py:290(full) 25 0.000 0.000 0.000 0.000 base.py:4083(__getitem__) 49 0.000 0.000 0.000 0.000 series.py:442(name) 6 0.000 0.000 0.001 0.000 algorithms.py:1616(take_nd) 48 0.000 0.000 0.000 0.000 dtypes.py:1119(is_dtype) 5 0.000 0.000 0.001 0.000 frame.py:3028(__setitem__) 2 0.000 0.000 0.000 0.000 {pandas._libs.lib.clean_index_list} 7 0.000 0.000 0.000 0.000 managers.py:984(iget) 17 0.000 0.000 0.001 0.000 blocks.py:2701(make_block) 18 0.000 0.000 0.000 0.000 generic.py:5095(__finalize__) 61 0.000 0.000 0.000 0.000 common.py:456(is_period_dtype) 10 0.000 0.000 0.001 0.000 generic.py:3532(_get_item_cache) 25 0.000 0.000 0.000 0.000 blocks.py:2374(__init__) 5 0.000 0.000 0.003 0.001 generic.py:5905(fillna) 52 0.000 0.000 0.000 0.000 common.py:492(is_interval_dtype) 87 0.000 0.000 0.000 0.000 {built-in method _abc._abc_instancecheck} 32 0.000 0.000 0.000 0.000 generic.py:5123(__getattr__) 52 0.000 0.000 0.000 0.000 common.py:530(is_categorical_dtype) 5 0.000 0.000 0.002 0.000 missing.py:545(interpolate_2d) 31 0.000 0.000 0.000 0.000 blocks.py:237(mgr_locs) 17 0.000 0.000 0.000 0.000 managers.py:212(shape) 22 0.000 0.000 0.000 0.000 managers.py:1532(__init__) 5 0.000 0.000 0.000 0.000 frame.py:3702(_sanitize_column) 6 0.000 0.000 0.000 0.000 algorithms.py:1487(_get_take_nd_function) 52 0.000 0.000 0.000 0.000 common.py:381(is_datetime64tz_dtype) 137 0.000 0.000 0.000 0.000 common.py:180(<lambda>) 38 0.000 0.000 0.000 0.000 <frozen importlib._bootstrap>:1017(_handle_fromlist) 2 0.000 0.000 0.001 0.000 indexing.py:1042(_getitem_tuple) 1 0.000 0.000 0.005 0.005 construction.py:329(_homogenize) 14 0.000 0.000 0.000 
0.000 _ufunc_config.py:39(seterr) 5 0.000 0.000 0.003 0.001 blocks.py:1143(_interpolate_with_fill) 1 0.000 0.000 0.001 0.001 indexing.py:1523(_setitem_with_indexer) 1 0.000 0.000 0.268 0.268 {built-in method builtins.exec} 38 0.000 0.000 0.000 0.000 construction.py:339(extract_array) 137 0.000 0.000 0.000 0.000 common.py:178(classes) 51 0.000 0.000 0.000 0.000 common.py:1733(pandas_dtype) 17 0.000 0.000 0.001 0.000 construction.py:726(convert) 118 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr} 14 0.000 0.000 0.000 0.000 blocks.py:256(make_block_same_class) 1 0.000 0.000 0.005 0.005 construction.py:609(_list_of_dict_to_arrays) 87 0.000 0.000 0.000 0.000 abc.py:96(__instancecheck__) 13 0.000 0.000 0.002 0.000 missing.py:130(_isna) 40 0.000 0.000 0.000 0.000 series.py:492(name) 4 0.000 0.000 0.000 0.000 cast.py:442(maybe_promote) 66 0.000 0.000 0.000 0.000 inference.py:322(is_hashable) 7 0.000 0.000 0.000 0.000 {built-in method numpy.arange} 13 0.000 0.000 0.000 0.000 {method 'get_loc' of 'pandas._libs.index.IndexEngine' objects} 28 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(copyto) 15 0.000 0.000 0.000 0.000 base.py:4036(__contains__) 25 0.000 0.000 0.000 0.000 common.py:696(is_integer_dtype) 1 0.000 0.000 0.000 0.000 function_base.py:4225(delete) 2 0.000 0.000 0.000 0.000 base.py:1646(is_unique) 14 0.000 0.000 0.000 0.000 _ufunc_config.py:139(geterr) 2 0.000 0.000 0.000 0.000 {built-in method _operator.add} 2 0.000 0.000 0.001 0.000 indexing.py:757(_getitem_lowerdim) 5 0.000 0.000 0.001 0.000 frame.py:3109(_set_item) 5 0.000 0.000 0.003 0.001 blocks.py:1088(interpolate) 5 0.000 0.000 0.001 0.000 cast.py:1051(soft_convert_objects) 9 0.000 0.000 0.000 0.000 base.py:463(_simple_new) 7 0.000 0.000 0.000 0.000 frame.py:3184(_box_col_values) 2 0.000 0.000 0.000 0.000 {method 'get_indexer' of 'pandas._libs.index.IndexEngine' objects} 5 0.000 0.000 0.000 0.000 fromnumeric.py:73(_wrapreduction) 18 0.000 0.000 0.000 0.000 
missing.py:75(clean_fill_method) 4 0.000 0.000 0.000 0.000 {pandas._libs.algos.take_1d_int64_int64} 54 0.000 0.000 0.000 0.000 common.py:1565(_get_dtype) 1 0.000 0.000 0.000 0.000 construction.py:360(extract_index) 56 0.000 0.000 0.000 0.000 range.py:687(__len__) 70/62 0.000 0.000 0.000 0.000 _asarray.py:16(asarray) 22 0.000 0.000 0.001 0.000 base.py:5559(ensure_index) 3 0.000 0.000 0.000 0.000 indexers.py:348(check_array_indexer) 18 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects} 13 0.000 0.000 0.000 0.000 base.py:2851(get_loc) 37 0.000 0.000 0.000 0.000 common.py:149(cast_scalar_indexer) 15 0.000 0.000 0.000 0.000 series.py:398(_set_axis) 5 0.000 0.000 0.000 0.000 base.py:4196(equals) 4 0.000 0.000 0.000 0.000 missing.py:358(array_equivalent) 2 0.000 0.000 0.000 0.000 numeric.py:2299(array_equal) 1 0.000 0.000 0.001 0.001 frame.py:5324(sort_index) 13 0.000 0.000 0.000 0.000 base.py:4976(_maybe_cast_indexer) 1 0.000 0.000 0.000 0.000 indexing.py:1863(_align_series) 7 0.000 0.000 0.000 0.000 numpy_.py:162(__init__) 5 0.000 0.000 0.000 0.000 generic.py:3568(_iset_item) 5 0.000 0.000 0.000 0.000 shape_base.py:82(atleast_2d) 13 0.000 0.000 0.000 0.000 {built-in method builtins.any} 14 0.000 0.000 0.000 0.000 blocks.py:2728(_extend_blocks) 5 0.000 0.000 0.001 0.000 blocks.py:2388(convert) 3 0.000 0.000 0.000 0.000 blocks.py:340(apply) 31 0.000 0.000 0.000 0.000 blocks.py:135(_check_ndim) 86 0.000 0.000 0.000 0.000 {built-in method builtins.hash} 17 0.000 0.000 0.000 0.000 generic.py:377(_get_axis) 1 0.000 0.000 0.000 0.000 base.py:1578(is_monotonic_increasing) 1 0.000 0.000 0.000 0.000 sorting.py:257(nargsort) 5 0.000 0.000 0.000 0.000 _validators.py:313(validate_fillna_kwargs) 1 0.000 0.000 0.000 0.000 blocks.py:782(setitem) 21 0.000 0.000 0.000 0.000 common.py:1541(_is_dtype) 24 0.000 0.000 0.000 0.000 {method 'ravel' of 'numpy.ndarray' objects} 3 0.000 0.000 0.000 0.000 generic.py:1319(__invert__) 1 0.000 0.000 0.005 0.005 
construction.py:498(to_arrays) 9 0.000 0.000 0.000 0.000 common.py:1330(is_bool_dtype) 16 0.000 0.000 0.000 0.000 base.py:5656(maybe_extract_name) 2 0.000 0.000 0.000 0.000 base.py:2957(get_indexer) 2 0.000 0.000 0.002 0.001 managers.py:1238(reindex_indexer) 5 0.000 0.000 0.000 0.000 indexing.py:2126(convert_to_index_sliceable) 6 0.000 0.000 0.000 0.000 common.py:97(is_bool_indexer) 5 0.000 0.000 0.001 0.000 blocks.py:2403(f) 14 0.000 0.000 0.000 0.000 managers.py:1613(internal_values) 3 0.000 0.000 0.000 0.000 numeric.py:50(__new__) 6 0.000 0.000 0.000 0.000 series.py:574(array) 7 0.000 0.000 0.000 0.000 managers.py:1564(from_array) 2 0.000 0.000 0.000 0.000 series.py:820(take) 1 0.000 0.000 0.011 0.011 frame.py:1660(from_records) 13 0.000 0.000 0.000 0.000 common.py:566(is_string_dtype) 1 0.000 0.000 0.000 0.000 base.py:5259(drop) 3 0.000 0.000 0.000 0.000 {built-in method _operator.invert} 54 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_scalar} 2 0.000 0.000 0.000 0.000 array_ops.py:160(arithmetic_op) 8 0.000 0.000 0.000 0.000 managers.py:1553(from_blocks) 42 0.000 0.000 0.000 0.000 _validators.py:208(validate_bool_kwarg) 5 0.000 0.000 0.001 0.000 missing.py:608(_fillna_prep) 5 0.000 0.000 0.000 0.000 frame.py:3722(reindexer) 5 0.000 0.000 0.001 0.000 generic.py:3572(_set_item) 32 0.000 0.000 0.000 0.000 common.py:603(<genexpr>) 42 0.000 0.000 0.000 0.000 managers.py:1575(_block) 2 0.000 0.000 0.001 0.001 blocks.py:1238(take_nd) 4 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects} 22 0.000 0.000 0.000 0.000 managers.py:1602(dtype) 17 0.000 0.000 0.000 0.000 managers.py:1846(_asarray_compat) 1 0.000 0.000 0.000 0.000 indexers.py:210(maybe_convert_indices) 4/2 0.000 0.000 0.001 0.000 indexing.py:864(__getitem__) 2 0.000 0.000 0.000 0.000 array_ops.py:119(na_arithmetic_op) 33 0.000 0.000 0.000 0.000 base.py:567(__len__) 30 0.000 0.000 0.000 0.000 generic.py:365(_get_axis_number) 2 0.000 0.000 0.000 0.000 indexing.py:1133(_convert_to_indexer) 51 
0.000 0.000 0.000 0.000 managers.py:214(<genexpr>) 23 0.000 0.000 0.000 0.000 common.py:329(apply_if_callable) 1 0.000 0.000 0.001 0.001 generic.py:4502(_reindex_with_indexers) 5 0.000 0.000 0.003 0.001 managers.py:560(interpolate) 5 0.000 0.000 0.000 0.000 missing.py:587(_cast_values_for_fillna) 2 0.000 0.000 0.000 0.000 cast.py:1570(construct_1d_object_array_from_listlike) 8 0.000 0.000 0.000 0.000 {method 'clear' of 'dict' objects} 4 0.000 0.000 0.000 0.000 managers.py:132(__init__) 14 0.000 0.000 0.000 0.000 {built-in method numpy.seterrobj} 12 0.000 0.000 0.000 0.000 common.py:608(is_dtype_equal) 15 0.000 0.000 0.000 0.000 common.py:422(is_timedelta64_dtype) 5 0.000 0.000 0.000 0.000 numeric.py:225(__contains__) 2 0.000 0.000 0.001 0.000 __init__.py:335(wrapper) 32 0.000 0.000 0.000 0.000 common.py:188(<lambda>) 5 0.000 0.000 0.000 0.000 blocks.py:2380(is_bool) 1 0.000 0.000 0.001 0.001 generic.py:4214(reindex) 13 0.000 0.000 0.000 0.000 base.py:1755(is_floating) 3 0.000 0.000 0.000 0.000 base.py:701(take) 50 0.000 0.000 0.000 0.000 blocks.py:233(mgr_locs) 13 0.000 0.000 0.000 0.000 common.py:595(condition) 3 0.000 0.000 0.000 0.000 indexing.py:2151(check_bool_indexer) 7 0.000 0.000 0.000 0.000 generic.py:3180(_set_as_cached) 14 0.000 0.000 0.000 0.000 series.py:540(_values) 19 0.000 0.000 0.000 0.000 cast.py:1595(construct_1d_ndarray_preserving_na) 5 0.000 0.000 0.001 0.000 blocks.py:2433(<listcomp>) 3 0.000 0.000 0.000 0.000 base.py:554(_engine) 5 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects} 5 0.000 0.000 0.000 0.000 fromnumeric.py:2277(all) 37 0.000 0.000 0.000 0.000 blocks.py:315(dtype) 8 0.000 0.000 0.000 0.000 common.py:598(is_excluded_dtype) 22 0.000 0.000 0.000 0.000 series.py:427(dtype) 1 0.000 0.000 0.001 0.001 indexing.py:661(__setitem__) 1 0.000 0.000 0.001 0.001 generic.py:3894(_drop_axis) 4 0.000 0.000 0.000 0.000 cast.py:598(_ensure_dtype_type) 7 0.000 0.000 0.000 0.000 _ufunc_config.py:437(__init__) 2 0.000 0.000 0.000 
0.000 expressions.py:61(_evaluate_standard) 32 0.000 0.000 0.000 0.000 common.py:183(classes_and_not_datetimelike) 46 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_float} 5 0.000 0.000 0.003 0.001 series.py:4519(fillna) 9 0.000 0.000 0.000 0.000 common.py:1180(needs_i8_conversion) 13 0.000 0.000 0.002 0.000 missing.py:47(isna) 5 0.000 0.000 0.000 0.000 cast.py:1731(validate_numeric_casting) 1 0.000 0.000 0.000 0.000 managers.py:321(_verify_integrity) 5 0.000 0.000 0.001 0.000 blocks.py:2427(_maybe_downcast) 8 0.000 0.000 0.000 0.000 generic.py:3250(_clear_item_cache) 7 0.000 0.000 0.000 0.000 _ufunc_config.py:441(__enter__) 2 0.000 0.000 0.000 0.000 series.py:2740(_construct_result) 3 0.000 0.000 0.001 0.000 generic.py:7127(notna) 9 0.000 0.000 0.000 0.000 common.py:1025(is_datetime_or_timedelta_dtype) 1 0.000 0.000 0.002 0.002 generic.py:3858(drop) 5 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(all) 2 0.000 0.000 0.000 0.000 generic.py:334(_construct_axes_from_arguments) 3 0.000 0.000 0.000 0.000 common.py:218(asarray_tuplesafe) 6 0.000 0.000 0.000 0.000 blocks.py:207(array_values) 3 0.000 0.000 0.000 0.000 managers.py:683(_consolidate_check) 3 0.000 0.000 0.001 0.000 missing.py:255(notna) 1 0.000 0.000 0.001 0.001 managers.py:1675(create_block_manager_from_arrays) 5 0.000 0.000 0.000 0.000 blocks.py:642(should_store) 10 0.000 0.000 0.000 0.000 base.py:544(_reset_identity) 7 0.000 0.000 0.000 0.000 {pandas._libs.internals.get_blkno_placements} 1 0.000 0.000 0.001 0.001 managers.py:1427(take) 7 0.000 0.000 0.000 0.000 blocks.py:319(iget) 17 0.000 0.000 0.000 0.000 managers.py:216(ndim) 8 0.000 0.000 0.000 0.000 {method 'fill' of 'numpy.ndarray' objects} 2 0.000 0.000 0.000 0.000 indexing.py:893(_getbool_axis) 14 0.000 0.000 0.000 0.000 managers.py:163(blknos) 1 0.000 0.000 0.000 0.000 {method 'argsort' of 'numpy.ndarray' objects} 1 0.000 0.000 0.000 0.000 _asarray.py:223(require) 7 0.000 0.000 0.000 0.000 numpy_.py:50(__init__) 8 0.000 0.000 0.000 0.000 
{built-in method builtins.sum} 3 0.000 0.000 0.000 0.000 range.py:393(_shallow_copy) 3 0.000 0.000 0.000 0.000 frame.py:441(__init__) 7 0.000 0.000 0.000 0.000 {method 'any' of 'numpy.ndarray' objects} 36 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_object} 2 0.000 0.000 0.000 0.000 array_ops.py:429(maybe_upcast_for_op) 1 0.000 0.000 0.001 0.001 construction.py:732(<listcomp>) 1 0.000 0.000 0.000 0.000 range.py:86(__new__) 7 0.000 0.000 0.000 0.000 _ufunc_config.py:446(__exit__) 4 0.000 0.000 0.000 0.000 blocks.py:244(make_block) 2 0.000 0.000 0.001 0.000 common.py:50(new_method) 28 0.000 0.000 0.000 0.000 {built-in method numpy.geterrobj} 8 0.000 0.000 0.000 0.000 range.py:452(equals) 5 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(atleast_2d) 5 0.000 0.000 0.000 0.000 cast.py:1700(convert_scalar_for_putitemlike) 2 0.000 0.000 0.000 0.000 indexing.py:925(_is_scalar_access) 3 0.000 0.000 0.000 0.000 series.py:750(__array__) 7 0.000 0.000 0.000 0.000 {built-in method builtins.all} 3 0.000 0.000 0.000 0.000 inference.py:360(is_sequence) 6 0.000 0.000 0.000 0.000 construction.py:580(is_empty_data) 18 0.000 0.000 0.000 0.000 generic.py:232(attrs) 1 0.000 0.000 0.001 0.001 frame.py:4017(reindex) 32 0.000 0.000 0.000 0.000 {built-in method builtins.callable} 3 0.000 0.000 0.000 0.000 blocks.py:350(_split_op_result) 46 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_bool} 1 0.000 0.000 0.006 0.006 construction.py:60(arrays_to_mgr) 2 0.000 0.000 0.000 0.000 managers.py:2015(_preprocess_slice_or_indexer) 5 0.000 0.000 0.000 0.000 {method 'view' of 'numpy.ndarray' objects} 28 0.000 0.000 0.000 0.000 multiarray.py:1043(copyto) 16 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects} 5 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(can_cast) 3 0.000 0.000 0.000 0.000 range.py:153(_data) 10 0.000 0.000 0.000 0.000 common.py:905(is_datetime64_any_dtype) 2 0.000 0.000 0.000 0.000 numeric.py:105(_shallow_copy) 1 0.000 0.000 0.000 0.000 
construction.py:651(_validate_or_indexify_columns) 1 0.000 0.000 0.000 0.000 generic.py:3949(_update_inplace) 1 0.000 0.000 0.001 0.001 frame.py:3871(_reindex_axes) 4 0.000 0.000 0.000 0.000 managers.py:138(<listcomp>) 8 0.000 0.000 0.000 0.000 common.py:1265(is_string_like_dtype) 2 0.000 0.000 0.000 0.000 expressions.py:217(evaluate) 9 0.000 0.000 0.000 0.000 indexers.py:52(is_list_like_indexer) 1 0.000 0.000 0.000 0.000 indexing.py:719(_convert_tuple) 10 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x907780} 1 0.000 0.000 0.000 0.000 base.py:3291(reindex) 8 0.000 0.000 0.000 0.000 inference.py:185(is_array_like) 2 0.000 0.000 0.000 0.000 base.py:4717(_maybe_promote) 1 0.000 0.000 0.000 0.000 generic.py:562(_set_axis) 5 0.000 0.000 0.000 0.000 common.py:1296(is_float_dtype) 3 0.000 0.000 0.001 0.000 series.py:4813(notna) 1 0.000 0.000 0.000 0.000 base.py:5726(_maybe_cast_data_without_dtype) 5 0.000 0.000 0.000 0.000 common.py:750(is_signed_integer_dtype) 3 0.000 0.000 0.000 0.000 generic.py:5197(_protect_consolidate) 2 0.000 0.000 0.000 0.000 generic.py:3213(_maybe_update_cacher) 5 0.000 0.000 0.000 0.000 frame.py:3162(_ensure_valid_index) 2 0.000 0.000 0.000 0.000 base.py:498(_shallow_copy) 2 0.000 0.000 0.000 0.000 indexing.py:709(_is_nested_tuple_indexer) 1 0.000 0.000 0.000 0.000 indexing.py:588(_get_setitem_indexer) 17 0.000 0.000 0.000 0.000 base.py:3870(_values) 7 0.000 0.000 0.000 0.000 _methods.py:44(_any) 1 0.000 0.000 0.001 0.001 frame.py:3908(_reindex_columns) 2 0.000 0.000 0.000 0.000 common.py:150(ensure_python_int) 6 0.000 0.000 0.000 0.000 indexing.py:866(<genexpr>) 1 0.000 0.000 0.000 0.000 common.py:241(index_labels_to_array) 8 0.000 0.000 0.000 0.000 _asarray.py:88(asanyarray) 3 0.000 0.000 0.000 0.000 _internal.py:830(npy_ctypes_check) 16 0.000 0.000 0.000 0.000 {pandas._libs.lib.item_from_zerodim} 8 0.000 0.000 0.000 0.000 common.py:1293(<lambda>) 5 0.000 0.000 0.000 0.000 indexing.py:237(loc) 4 0.000 0.000 0.000 0.000 
{method 'all' of 'numpy.ndarray' objects} 2 0.000 0.000 0.000 0.000 generic.py:3369(xs) 7 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_int64} 5 0.000 0.000 0.000 0.000 common.py:348(is_datetime64_dtype) 4 0.000 0.000 0.000 0.000 generic.py:471(ndim) 1 0.000 0.000 0.000 0.000 sorting.py:392(ensure_key_mapped) 1 0.000 0.000 0.000 0.000 base.py:3860(array) 4 0.000 0.000 0.000 0.000 numpy_.py:210(__array__) 1 0.000 0.000 0.000 0.000 generic.py:4490(_needs_reindex_multi) 4 0.000 0.000 0.000 0.000 indexing.py:2258(is_label_like) 5 0.000 0.000 0.000 0.000 frame.py:568(axes) 7 0.000 0.000 0.000 0.000 generic.py:3609(_check_setitem_copy) 1 0.000 0.000 0.000 0.000 indexing.py:627(_ensure_listlike_indexer) 7 0.000 0.000 0.000 0.000 managers.py:675(is_consolidated) 1 0.000 0.000 0.000 0.000 range.py:134(_simple_new) 3 0.000 0.000 0.000 0.000 numpy_.py:414(to_numpy) 14 0.000 0.000 0.000 0.000 managers.py:179(blklocs) 1 0.000 0.000 0.000 0.000 range.py:697(__getitem__) 5 0.000 0.000 0.000 0.000 managers.py:1071(value_getitem) 5 0.000 0.000 0.000 0.000 base.py:520(is_) 20 0.000 0.000 0.000 0.000 {method 'pop' of 'dict' objects} 5 0.000 0.000 0.000 0.000 generic.py:447(_info_axis) 1 0.000 0.000 0.000 0.000 _asarray.py:300(<setcomp>) 1 0.000 0.000 0.001 0.001 managers.py:1812(_simple_blockify) 1 0.000 0.000 0.000 0.000 managers.py:220(set_axis) 2 0.000 0.000 0.000 0.000 __init__.py:307(_align_method_SERIES) 5 0.000 0.000 0.000 0.000 base.py:3896(_get_engine_target) 2 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(array_equal) 1 0.000 0.000 0.000 0.000 base.py:5650(default_index) 5 0.000 0.000 0.000 0.000 frame.py:1099(__len__) 5 0.000 0.000 0.000 0.000 managers.py:977(_consolidate_inplace) 15 0.000 0.000 0.000 0.000 numeric.py:150(is_all_dates) 3 0.000 0.000 0.000 0.000 managers.py:156(from_blocks) 7 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_platform_int} 1 0.000 0.000 0.000 0.000 _validators.py:218(validate_axis_style_args) 14 0.000 0.000 0.000 0.000 
blocks.py:201(internal_values) 4 0.000 0.000 0.000 0.000 array_ops.py:404(maybe_upcast_datetimelike_array) 15 0.000 0.000 0.000 0.000 series.py:381(_constructor) 1 0.000 0.000 0.002 0.002 frame.py:4038(drop) 5 0.000 0.000 0.000 0.000 fromnumeric.py:74(<dictcomp>) 2 0.000 0.000 0.000 0.000 indexing.py:1057(_get_label) 2 0.000 0.000 0.000 0.000 common.py:1123(is_datetimelike_v_numeric) 14 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_integer} 2 0.000 0.000 0.000 0.000 expressions.py:195(_bool_arith_check) 2 0.000 0.000 0.000 0.000 generic.py:5211(f) 9 0.000 0.000 0.000 0.000 managers.py:385(<dictcomp>) 2 0.000 0.000 0.000 0.000 missing.py:136(dispatch_fill_zeros) 3 0.000 0.000 0.000 0.000 managers.py:684(<listcomp>) 2 0.000 0.000 0.000 0.000 generic.py:4447(<genexpr>) 2 0.000 0.000 0.000 0.000 __init__.py:88(get_op_result_name) 3 0.000 0.000 0.000 0.000 base.py:669(size) 7 0.000 0.000 0.000 0.000 base.py:590(dtype) 6 0.000 0.000 0.000 0.000 indexing.py:715(<genexpr>) 1 0.000 0.000 0.000 0.000 base.py:5206(delete) 10 0.000 0.000 0.000 0.000 base.py:1378(nlevels) 5 0.000 0.000 0.000 0.000 common.py:283(is_null_slice) 1 0.000 0.000 0.000 0.000 fromnumeric.py:55(_wrapfunc) 2 0.000 0.000 0.000 0.000 common.py:806(is_unsigned_integer_dtype) 4 0.000 0.000 0.000 0.000 {built-in method pandas._libs.missing.checknull} 1 0.000 0.000 0.000 0.000 __init__.py:111(_maybe_match_name) 8 0.000 0.000 0.000 0.000 base.py:1175(name) 2 0.000 0.000 0.000 0.000 expressions.py:186(_has_bool_dtype) 2 0.000 0.000 0.000 0.000 generic.py:382(_get_block_manager_axis) 3 0.000 0.000 0.000 0.000 missing.py:665(clean_reindex_fill_method) 1 0.000 0.000 0.000 0.000 indexers.py:91(is_empty_indexer) 1 0.000 0.000 0.000 0.000 base.py:5672(_maybe_cast_with_dtype) 2 0.000 0.000 0.000 0.000 {built-in method builtins.max} 1 0.000 0.000 0.000 0.000 indexers.py:68(is_scalar_indexer) 13 0.000 0.000 0.000 0.000 numeric.py:237(inferred_type) 2 0.000 0.000 0.000 0.000 {method 'max' of 'numpy.ndarray' objects} 1 
0.000 0.000 0.000 0.000 managers.py:533(setitem) 1 0.000 0.000 0.000 0.000 numeric.py:166(ones) 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(delete) 10 0.000 0.000 0.000 0.000 missing.py:554(<lambda>) 2 0.000 0.000 0.000 0.000 generic.py:5208(_consolidate_inplace) 1 0.000 0.000 0.001 0.001 _decorators.py:307(wrapper) 3 0.000 0.000 0.000 0.000 {method 'pop' of 'list' objects} 2 0.000 0.000 0.000 0.000 series.py:838(_take_with_is_copy) 8 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_iterator} 3 0.000 0.000 0.000 0.000 indexing.py:663(<genexpr>) 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(ndim) 2 0.000 0.000 0.000 0.000 indexing.py:1774(<genexpr>) 2 0.000 0.000 0.000 0.000 common.py:1509(is_complex_dtype) 8 0.000 0.000 0.000 0.000 {method 'items' of 'dict' objects} 4 0.000 0.000 0.000 0.000 _methods.py:47(_all) 1 0.000 0.000 0.000 0.000 common.py:190(all_none) 2 0.000 0.000 0.000 0.000 generic.py:372(_get_axis_name) 2 0.000 0.000 0.000 0.000 _methods.py:28(_amax) 2 0.000 0.000 0.000 0.000 generic.py:362(<dictcomp>) 3 0.000 0.000 0.000 0.000 indexing.py:1908(<genexpr>) 1 0.000 0.000 0.000 0.000 indexing.py:2226(maybe_convert_ix) 2 0.000 0.000 0.000 0.000 base.py:1685(is_boolean) 3 0.000 0.000 0.000 0.000 {built-in method builtins.iter} 2 0.000 0.000 0.000 0.000 base.py:1720(is_integer) 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(concatenate) 1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(nonzero) 1 0.000 0.000 0.000 0.000 managers.py:703(is_view) 1 0.000 0.000 0.001 0.001 construction.py:709(_convert_object_array) 1 0.000 0.000 0.000 0.000 indexing.py:1891(<listcomp>) 2 0.000 0.000 0.000 0.000 indexing.py:1886(ravel) 5 0.000 0.000 0.000 0.000 fromnumeric.py:2273(_all_dispatcher) 2 0.000 0.000 0.000 0.000 indexers.py:84(<genexpr>) 2 0.000 0.000 0.000 0.000 dispatch.py:11(should_extension_dispatch) 1 0.000 0.000 0.000 0.000 base.py:1182(name) 3 0.000 0.000 0.000 0.000 indexers.py:109(<genexpr>) 1 0.000 0.000 0.000 
0.000 generic.py:5238(_is_mixed_type) 6 0.000 0.000 0.000 0.000 {method 'extend' of 'list' objects} 1 0.000 0.000 0.000 0.000 generic.py:3589(_check_is_chained_assignment_possible) 1 0.000 0.000 0.000 0.000 {built-in method builtins.sorted} 1 0.000 0.000 0.000 0.000 base.py:2000(inferred_type) 2 0.000 0.000 0.000 0.000 blocks.py:229(fill_value) 1 0.000 0.000 0.000 0.000 generic.py:5240(<lambda>) 2 0.000 0.000 0.000 0.000 managers.py:323(<genexpr>) 2 0.000 0.000 0.000 0.000 construction.py:638(<genexpr>) 1 0.000 0.000 0.000 0.000 generic.py:3742(_is_view) 1 0.000 0.000 0.000 0.000 base.py:573(__array__) 1 0.000 0.000 0.000 0.000 managers.py:688(is_mixed_type) 3 0.000 0.000 0.000 0.000 base.py:637(ndim) 2 0.000 0.000 0.000 0.000 managers.py:961(consolidate) 2 0.000 0.000 0.000 0.000 construction.py:682(<genexpr>) 5 0.000 0.000 0.000 0.000 shape_base.py:78(_atleast_2d_dispatcher) 1 0.000 0.000 0.000 0.000 {method 'update' of 'dict' objects} 1 0.000 0.000 0.000 0.000 common.py:211(count_not_none) 4 0.000 0.000 0.000 0.000 numeric.py:81(_validate_dtype) 5 0.000 0.000 0.000 0.000 series.py:1109(_is_mixed_type) 5 0.000 0.000 0.000 0.000 multiarray.py:469(can_cast) 2 0.000 0.000 0.000 0.000 managers.py:233(_is_single_block) 2 0.000 0.000 0.000 0.000 function.py:48(__call__) 1 0.000 0.000 0.000 0.000 indexers.py:117(check_setitem_lengths) 1 0.000 0.000 0.000 0.000 fromnumeric.py:1759(nonzero) 3 0.000 0.000 0.000 0.000 managers.py:1680(<genexpr>) 1 0.000 0.000 0.000 0.000 base.py:4275(identical) 1 0.000 0.000 0.000 0.000 managers.py:1852(_shape_compat) 1 0.000 0.000 0.000 0.000 base.py:66(_reset_cache) 4 0.000 0.000 0.000 0.000 base.py:3105(_get_partial_string_timestamp_match_key) 1 0.000 0.000 0.000 0.000 indexing.py:100(iloc) 2 0.000 0.000 0.000 0.000 frame.py:421(_constructor) 2 0.000 0.000 0.000 0.000 numeric.py:2295(_array_equal_dispatcher) 1 0.000 0.000 0.000 0.000 blocks.py:311(shape) 1 0.000 0.000 0.000 0.000 range.py:214(start) 3 0.000 0.000 0.000 0.000 
common.py:215(<genexpr>) 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} 1 0.000 0.000 0.000 0.000 base.py:5623(ensure_has_len) 2 0.000 0.000 0.000 0.000 common.py:194(<genexpr>) 2 0.000 0.000 0.000 0.000 base.py:1385(_sort_levels_monotonic) 1 0.000 0.000 0.000 0.000 blocks.py:180(is_view) 1 0.000 0.000 0.000 0.000 fromnumeric.py:2986(ndim) 1 0.000 0.000 0.000 0.000 indexing.py:2241(is_nested_tuple) 1 0.000 0.000 0.000 0.000 {built-in method numpy.core._multiarray_umath.normalize_axis_index} 1 0.000 0.000 0.000 0.000 {method 'upper' of 'str' objects} 3 0.000 0.000 0.000 0.000 base.py:561(<lambda>) 2 0.000 0.000 0.000 0.000 indexing.py:913(_validate_key) 2 0.000 0.000 0.000 0.000 blocks.py:2435(_can_hold_element) 1 0.000 0.000 0.000 0.000 base.py:3275(_can_reindex) 1 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects} 1 0.000 0.000 0.000 0.000 managers.py:259(items) 1 0.000 0.000 0.000 0.000 fromnumeric.py:2982(_ndim_dispatcher) 1 0.000 0.000 0.000 0.000 numpy_.py:318(_values_for_argsort) 1 0.000 0.000 0.000 0.000 numpy_.py:203(dtype) 1 0.000 0.000 0.000 0.000 indexing.py:922(_has_valid_setitem_indexer) 1 0.000 0.000 0.000 0.000 function_base.py:4221(_delete_dispatcher) 1 0.000 0.000 0.000 0.000 range.py:260(step) 1 0.000 0.000 0.000 0.000 multiarray.py:145(concatenate) 1 0.000 0.000 0.000 0.000 fromnumeric.py:1755(_nonzero_dispatcher) 1 0.000 0.000 0.000 0.000 range.py:237(stop)
# Trial run: parse the sheet currently bound to `s`.
df = process_reporte(s)
# Preview the first rows that carry a value in the "Asignado" column.
df.loc[df["Asignado"].notna()].head()
CodEntidad | CodPrograma | CodActOb | Programa | ActOb | Asignado | Modificado | Vigente | PreCompromiso | Comprometido | Devengado | Pagado | SaldoPorComprometer | SaldoPorDevengar | SaldoPorPagar | PorcentajeE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 11130003 – PRESIDENCIA DE LA REPÚBLICA | 11 | 11 00 000 001 000 | GESTIÓN GUBERNAMENTAL | DESPACHO PRESIDENCIAL | 8113671.0 | -1927157.0 | 6186514.0 | 0.0 | 6177429.51 | 6177429.51 | 6150508.51 | 9084.49 | 9084.49 | 26921.0 | 99.85315655957459 |
7 | 11130003 – PRESIDENCIA DE LA REPÚBLICA | 12 | 12 00 000 001 000 | GESTIÓN VICEPRESIDENCIAL | DESPACHO VICEPRESIDENCIAL | 23034990.0 | -1002969.0 | 22032021.0 | 0.0 | 15050230.76 | 15050230.76 | 14935931.17 | 6981790.24 | 6981790.24 | 114299.59 | 68.31071357457402 |
12 | 11130003 – PRESIDENCIA DE LA REPÚBLICA | 13 | 13 00 000 001 000 | ASUNTOS ADMINISTRATIVOS Y DE SEGURIDAD | SERVICIOS ADMINISTRATIVOS Y DE SEGURIDAD | 156088079.0 | 1422802.0 | 157510881.0 | 0.0 | 157386954.31 | 157386954.31 | 156075517.87 | 123926.69 | 123926.69 | 1311436.44 | 99.92132182283966 |
17 | 11130003 – PRESIDENCIA DE LA REPÚBLICA | 14 | 14 00 000 001 000 | SEGURIDAD PERIMETRAL | SEGURIDAD MOVIL PERMANENTE | 17950619.0 | 0.0 | 17950619.0 | 0.0 | 17929045.37 | 17929045.37 | 17419397.91 | 21573.63 | 21573.63 | 509647.46 | 99.87981679071903 |
22 | 11130003 – PRESIDENCIA DE LA REPÚBLICA | 17 | 17 00 000 001 000 | EMBAJADA EXTRAORDINARIA ITINERANTE DE ASUNTOS … | DIRECCIÓN Y COORDINACIÓN | 1179400.0 | -1179400.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# Build one combined table (`dfs`) from every workbook in `xls_files`.
# The per-workbook frames are gathered in a list and concatenated once at the
# end: DataFrame.append copied the whole accumulated frame on every call and
# was removed in pandas 2.0.
frames = []
files = {}  # years already processed, used to skip duplicated workbooks
# For every workbook:
for i, x in enumerate(xls_files):
    s = x.sheet_by_index(0)
    # The year is read from a fixed cell (row index 14, column index 8)
    # of the first sheet.
    year = s.row(14)[8].value
    print("Procesando ", year)
    # Skip workbooks whose year has already been seen.
    if year in files:
        print("se encontró un duplicado", year)
        continue
    files[year] = True
    temp = process_reporte(s)
    if temp is None:
        print(i, "Empty report")
        continue
    temp["Year"] = year
    frames.append(temp)
# pd.concat raises on an empty list, so keep the original empty-frame result
# when no workbook produced data.
dfs = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
Procesando 2014.0 se encontró un duplicado 2014.0 Procesando 2005.0 Procesando 2017.0 Procesando 2008.0 Procesando 2006.0 Procesando 2013.0 Procesando 2018.0 Procesando 2012.0 Procesando 2011.0 Procesando 2007.0 Procesando 2016.0 Procesando 2010.0 Procesando 2009.0 Procesando 2019.0 Procesando 2020.0 Procesando 2015.0 Procesando 2004.0
# Keep only the rows that have a value in the "Asignado" column.
dfs = dfs.dropna(subset=["Asignado"])
dfs.shape
(16573, 17)
# Write the combined table as a gzip-compressed CSV.
# Uncompressed variant, kept for reference:
# dfs.to_csv("sicoin_programas_acts_2004-2020.csv")
dfs.to_csv("sicoin_programas_acts_2004-2020.csv.gz", compression="gzip")
# IPython shell escape: list the working directory (including hidden entries) with human-readable sizes.
!ls -lha
total 5.5M drwxrwxr-x 4 guillermo guillermo 4.0K nov 23 22:55 . drwxrwxr-x 6 guillermo guillermo 4.0K nov 22 15:56 .. -rw-rw-r-- 1 guillermo guillermo 29K nov 23 22:55 'explorar datos.ipynb' drwxrwxr-x 2 guillermo guillermo 4.0K nov 23 22:31 .ipynb_checkpoints drwxrwxr-x 2 guillermo guillermo 4.0K nov 22 23:36 __pycache__ -rw-rw-r-- 1 guillermo guillermo 75K nov 23 22:43 scrape_presupuestos.ipynb -rw-rw-r-- 1 guillermo guillermo 4.4M nov 23 22:43 sicoin_programas_acts_2004-2020.csv -rw-rw-r-- 1 guillermo guillermo 1006K nov 23 22:57 sicoin_programas_acts_2004-2020.csv.gz -rw-rw-r-- 1 guillermo guillermo 4.1K nov 22 16:47 SICOIN_scraper.py