Skip to content

Commit 1b2c4dc

Browse files
committed
time add
1 parent a42c676 commit 1b2c4dc

3 files changed

Lines changed: 49 additions & 7 deletions

File tree

Gold/AppData.parquet

-15.1 MB
Binary file not shown.

Statistical Analysis/Analysis Report.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,10 +100,10 @@ <h2>Linear Regression Model Summary</h2>
100100
<th>Method:</th> <td>Least Squares</td> <th> F-statistic: </th> <td> 1372.</td>
101101
</tr>
102102
<tr>
103-
<th>Date:</th> <td>dom, 23 jun 2024</td> <th> Prob (F-statistic):</th> <td> 0.00</td>
103+
<th>Date:</th> <td>qui, 27 jun 2024</td> <th> Prob (F-statistic):</th> <td> 0.00</td>
104104
</tr>
105105
<tr>
106-
<th>Time:</th> <td>21:12:14</td> <th> Log-Likelihood: </th> <td> 7860.5</td>
106+
<th>Time:</th> <td>15:50:16</td> <th> Log-Likelihood: </th> <td> 7860.5</td>
107107
</tr>
108108
<tr>
109109
<th>No. Observations:</th> <td> 5564</td> <th> AIC: </th> <td>-1.572e+04</td>

backend.py

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import duckdb as ddb
1414
import geobr
1515
import geopandas as gpd
16+
import time
1617

1718
logging.basicConfig(level = logging.INFO
1819
, format = '%(asctime)s - %(levelname)s - %(message)s')
@@ -33,13 +34,16 @@ def __init__(self
3334

3435
def create_folders(self) -> None:
3536
"""Create required folders as layer directories."""
37+
start_time = time.time()
3638
folders = [self.bronze_folder
3739
, self.silver_folder
3840
, self.gold_folder
3941
, self.statistical_analysis_folder]
4042
for folder in folders:
4143
os.makedirs(folder
4244
, exist_ok = True)
45+
elapsed_time = time.time() - start_time # End timing
46+
logging.info(f"Created folders in {elapsed_time:.2f} seconds")
4347

4448
def saving_step(self
4549
, df : pd.DataFrame
@@ -56,10 +60,13 @@ def saving_step(self
5660
Returns:
5761
File: Saved file at layer directory.
5862
"""
63+
start_time = time.time()
5964
path = os.path.join(folder
6065
, filename)
6166
df.to_parquet(path
6267
, engine = 'pyarrow')
68+
elapsed_time = time.time() - start_time
69+
logging.info(f"Saved file {filename} in {elapsed_time:.2f} seconds")
6370

6471
def bronze_fetch(self
6572
, series : str
@@ -78,6 +85,7 @@ def bronze_fetch(self
7885
Returns:
7986
DataFrame: Fetched data as pandas DataFrame, then saving at Bronze layer, or None if an error occurs (with an error log).
8087
"""
88+
start_time = time.time()
8189
try:
8290
# Special handling for IDHM 2010 (IPEAdataR)
8391
if filename == 'IDHM_2010.parquet':
@@ -101,6 +109,8 @@ def bronze_fetch(self
101109
self.saving_step(raw_data
102110
, self.bronze_folder
103111
, filename)
112+
elapsed_time = time.time() - start_time
113+
logging.info(f"Bronze fetching {filename} in {elapsed_time:.2f} seconds")
104114
return raw_data
105115
except Exception as e:
106116
logging.error(f'Error fetching data for {filename}: {e}')
@@ -119,6 +129,7 @@ def silver_transform(self
119129
Returns:
120130
DataFrame: Processed data as pandas DataFrame, then saving at Silver layer, or None if an error occurs (with an error log).
121131
"""
132+
start_time = time.time()
122133
try:
123134
if 'IDHM_2010.parquet' in filename:
124135
date_filter = pd.to_datetime('2010-01-01')
@@ -164,6 +175,8 @@ def silver_transform(self
164175
self.saving_step(transf_df
165176
, self.silver_folder
166177
, filename)
178+
elapsed_time = time.time() - start_time
179+
logging.info(f"Silver transforming {filename} in {elapsed_time:.2f} seconds")
167180
return transf_df
168181
except Exception as e:
169182
logging.error(f'Error transforming data for {filename}: {e}')
@@ -180,6 +193,7 @@ def gold_finish(self
180193
Returns:
181194
DataFrame: Processed data as a single pandas DataFrame, then saving at Gold layer and DuckDB, or None if an error occurs. Also, Descriptive Summary as a csv file, then saving at Statistical Analysis folder, or None if an error occurs.
182195
"""
196+
start_time = time.time()
183197
try:
184198
df = self.join_list[0]
185199
for transf_df in self.join_list[1:]:
@@ -219,7 +233,8 @@ def gold_finish(self
219233
conn = ddb.connect(self.db_path)
220234
conn.execute('CREATE TABLE IF NOT EXISTS df AS SELECT * FROM df')
221235
conn.close()
222-
236+
elapsed_time = time.time() - start_time
237+
logging.info(f"Gold finishing {filename} in {elapsed_time:.2f} seconds")
223238
return df
224239
except Exception as e:
225240
logging.error(f'Error finalizing data for {filename}: {e}')
@@ -244,6 +259,7 @@ def process_data(self
244259
if bronze_fetch is done, and silver_transform isn't, do silver_transform,\n
245260
if silver_transform is done, prepare pandas DataFrame to be processed at gold_finish.
246261
"""
262+
start_time = time.time()
247263
bronze_df = self.bronze_fetch(series
248264
, year
249265
, filename
@@ -253,6 +269,8 @@ def process_data(self
253269
, filename)
254270
if silver_df is not None:
255271
self.join_list.append(silver_df)
272+
elapsed_time = time.time() - start_time
273+
logging.info(f"Processing data {filename} in {elapsed_time:.2f} seconds")
256274

257275
def analyze_data(self
258276
, df : pd.DataFrame) -> None:
@@ -266,6 +284,7 @@ def analyze_data(self
266284
Statistical Model calculations and conversion to HTML.\n
267285
Correlation Matrix, Linear Regression and ANOVA, all saved in a single HTML file at Statistical Analysis folder.
268286
"""
287+
start_time = time.time()
269288
try:
270289
corr_matrix = df[['IDHM 2010'
271290
, 'Carga Tributária Municipal 2010'
@@ -328,14 +347,19 @@ def analyze_data(self
328347
with open(report_filename
329348
, 'w') as f:
330349
f.write(html_report)
350+
elapsed_time = time.time() - start_time
351+
logging.info(f"Analyzing data {report_filename} in {elapsed_time:.2f} seconds")
331352
except Exception as e:
332353
logging.error(f'Error analyzing data: {e}')
333354

334355
class Database:
335356
def __init__(self):
336357
"""Create connection to DuckDB database."""
358+
start_time = time.time()
337359
self._install_extensions()
338360
self.conn = ddb.connect('ipea.db')
361+
elapsed_time = time.time() - start_time
362+
logging.info(f"Initialized database in {elapsed_time:.2f} seconds")
339363

340364
def _install_extensions(self):
341365
"""Install extensions to DuckDB database."""
@@ -358,10 +382,13 @@ def fetch_data(self) -> pd.DataFrame:
358382
Returns:
359383
DataFrame: The finished pandas DataFrame.
360384
"""
385+
start_time = time.time()
361386
try:
362387
conn = ddb.connect(self.db_path)
363388
df = conn.execute('SELECT * FROM df').fetchdf()
364-
conn.close()
389+
conn.close()
390+
elapsed_time = time.time() - start_time
391+
logging.info(f"Fetched data in {elapsed_time:.2f} seconds")
365392
return df
366393
except Exception as e:
367394
logging.error(f'Error loading data from DuckDB: {e}')
@@ -374,11 +401,14 @@ def fetch_geodata(self) -> gpd.GeoDataFrame:
374401
Returns:
375402
GeoDataFrame: The finished GeoDataFrame.
376403
"""
404+
start_time = time.time()
377405
try:
378406
gdf = geobr.read_municipality(code_muni = 'all'
379407
, year = 2010)
380408
gdf = gpd.GeoDataFrame(gdf).drop(columns = ['name_muni'
381409
, 'code_state']).rename(columns = {'abbrev_state' : 'UF'})
410+
elapsed_time = time.time() - start_time
411+
logging.info(f"Fetched geodata in {elapsed_time:.2f} seconds")
382412
return gdf
383413
except Exception as e:
384414
logging.error(f'Error fetching geodata: {e}')
@@ -399,15 +429,24 @@ def merge_data(data : pd.DataFrame
399429
Returns:
400430
GeoDataFrame: A GeoDataFrame containing the selected IPEA data.
401431
"""
432+
start_time = time.time()
402433
data.loc[:, 'CodMunIBGE'] = data['CodMunIBGE'].astype(int)
403434
geodata['code_muni'] = geodata['code_muni'].astype(int)
404435
geodata = geodata.rename(columns = {'code_muni' : 'CodMunIBGE'})
405436
app_data = data.merge(geodata
406437
, how = 'left'
407438
, on = 'CodMunIBGE')
408-
app_data = gpd.GeoDataFrame(app_data, geometry='geometry')
409-
file_path = os.path.join(gold_folder, 'AppData.parquet')
410-
app_data.to_parquet(file_path, index=None, compression='snappy', schema_version=None)
439+
app_data = gpd.GeoDataFrame(app_data
440+
, geometry = 'geometry')
441+
app_data['geometry'] = app_data.geometry.simplify(tolerance = 0.01)
442+
file_path = os.path.join(gold_folder
443+
, 'AppData.parquet')
444+
app_data.to_parquet(file_path
445+
, index = None
446+
, compression = 'snappy'
447+
, schema_version = None)
448+
elapsed_time = time.time() - start_time
449+
logging.info(f"Merged data in {elapsed_time:.2f} seconds")
411450
return gpd.GeoDataFrame(app_data)
412451

413452
def main():
@@ -437,9 +476,12 @@ def main():
437476
, ('Municípios', None, 'Municípios.parquet')]
438477

439478
for series, year, filename in data_series:
479+
start_time = time.time()
440480
processor.process_data(series
441481
, year
442482
, filename)
483+
elapsed_time = time.time() - start_time
484+
logging.info(f"Processed data for {filename} in {elapsed_time:.2f} seconds")
443485

444486
r_code = """
445487
install.packages('ipeadatar', repos = 'http://cran.r-project.org')

0 commit comments

Comments
 (0)