Skip to content

Commit 4b89707

Browse files
authored
Merge pull request #6675 from Flowminder/parquet_fdw
Parquet fdw
2 parents 76a9b30 + e3d6cf0 commit 4b89707

13 files changed

Lines changed: 267 additions & 39 deletions

CHANGELOG.md

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,18 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66

77
## [Unreleased]
88

9+
### Added
10+
- Added support for Parquet foreign tables using [parquet_fdw](https://github.com/adjust/parquet_fdw)
11+
12+
### Changed
13+
- FlowKit test and synthetic data now use Parquet foreign tables.
14+
> [!WARNING]
15+
> The location of the parquet files in the container is `/parquet_data`. If you are testing with larger amounts of data, you may wish to add an additional bind mount for this location.
16+
17+
### Fixed
18+
19+
### Removed
20+
921
## [1.27.0]
1022

1123
### Added
@@ -19,11 +31,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
1931
- The location inside the container of FlowDB's automatically generated config file has changed to `/flowdb_autoconf/$AUTO_CONFIG_FILE_NAME`.
2032

2133

22-
23-
### Fixed
24-
25-
### Removed
26-
2734
## [1.26.0]
2835

2936
### Changed

docker-compose-syntheticdata.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,4 @@ services:
2929
DISASTER_START: ${DISASTER_START:?Must set DISASTER_START env var}
3030
DISASTER_END: ${DISASTER_END:?Must set DISASTER_END env var}
3131
DISASTER_REGION_PCOD: ${DISASTER_REGION_PCOD:-NPL.1.1_1}
32+

flowdb.Dockerfile

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,25 @@ RUN apt-get update \
8282
&& mv pldebugger /usr/local/src \
8383
&& make -C /usr/local/src/pldebugger \
8484
&& make -C /usr/local/src/pldebugger install \
85-
&& apt-get remove -y libssl-dev \
86-
libkrb5-dev \
87-
build-essential \
88-
git \
85+
&& apt-get remove -y build-essential git \
8986
&& apt purge -y --auto-remove \
9087
&& rm -rf /var/lib/apt/lists/*
9188

89+
# Parquet foreign tables
90+
RUN apt-get update -y && apt-get install -y --no-install-recommends git build-essential ca-certificates lsb-release wget \
91+
&& wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
92+
&& apt-get install -y --no-install-recommends ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
93+
&& rm ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb \
94+
&& apt-get update -y \
95+
&& apt-get install -y --no-install-recommends libarrow-dev libparquet-dev libparquet1600 libarrow1600 \
96+
&& pip3 install pyarrow \
97+
&& git clone --branch pg16-compatibility --single-branch https://github.com/adjust/parquet_fdw.git \
98+
&& mv parquet_fdw /usr/local/src \
99+
&& make -C /usr/local/src/parquet_fdw \
100+
&& make -C /usr/local/src/parquet_fdw install \
101+
&& apt-get remove -y build-essential git wget libarrow-dev libparquet-dev \
102+
&& apt purge -y --auto-remove \
103+
&& rm -rf /var/lib/apt/lists/*
92104

93105

94106
#

flowdb/bin/build/0010_create_extensions.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66

77

8-
set -e
8+
set -euo pipefail
99

1010
#
1111
# Perform all actions as $POSTGRES_USER
@@ -14,7 +14,7 @@ export PGUSER="$POSTGRES_USER"
1414
EXTENSIONS=('postgis' 'postgis_raster' 'postgis_topology' 'fuzzystrmatch' \
1515
'file_fdw' 'uuid-ossp' 'plpython3u' \
1616
'tsm_system_rows' 'pgrouting' 'pldbgapi' 'pg_median_utils'\
17-
'ogr_fdw' 'tds_fdw' 'btree_gist')
17+
'ogr_fdw' 'tds_fdw' 'btree_gist' 'parquet_fdw')
1818

1919
#
2020
# Create the 'template_postgis' template db
@@ -52,6 +52,7 @@ echo "Creating extension servers in $DB."
5252
psql --dbname="$DB" <<-EOSQL
5353
CREATE SERVER csv_fdw
5454
FOREIGN DATA WRAPPER file_fdw;
55+
CREATE SERVER parquet_srv FOREIGN DATA WRAPPER parquet_fdw;
5556
EOSQL
5657

5758
done

flowdb/testdata/bin/9900_ingest_test_data.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
set -e
88
export PGUSER="$POSTGRES_USER"
9+
SKIP_TEST_QA_CHECK=${SKIP_TEST_QA_CHECK:-"false"}
910

1011
#
1112
# Ingest test data.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/sh
2+
# This Source Code Form is subject to the terms of the Mozilla Public
3+
# License, v. 2.0. If a copy of the MPL was not distributed with this
4+
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
5+
6+
7+
set -e
8+
export PGUSER="$POSTGRES_USER"
9+
10+
#
11+
# Converts events tables to Parquet partitions.
12+
#
13+
# Note that the only purpose of this script is to
14+
# call the Python script which does the actual data
15+
# conversion, but we need this shell script as
16+
# a wrapper because the PostgreSQL entrypoint script
17+
# does not pick up .py files on its own.
18+
#
19+
20+
export DIR=/docker-entrypoint-initdb.d/py/testdata
21+
22+
echo "Running Python script to convert events tables to Parquet partitions."
23+
pipenv run python ${DIR}/zz_convert_events_to_parquet.py

flowdb/testdata/synthetic_data/Pipfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ tohu = "==0.6.7"
1212
numpy = "<=2.0.0" # Tohu uses float division where it should be using int division, and hence passes a float where numpy expects an int (https://github.com/maxalbert/tohu/blob/3adf0c58b13ef1e1d716d7d613484d2adc58fb60/tohu/v6/primitive_generators.py#L335)
1313
# This used to work, but doesn't as of numpy 1.26.0 (although I haven't managed to track down the relevant change or find a corresponding issue or changelog entry)
1414
jinja2 = "*"
15+
pyarrow = "*"
16+
1517
[dev-packages]
1618
black = {extras = ["jupyter"],version = "==24.4.2"}
1719

flowdb/testdata/synthetic_data/Pipfile.lock

Lines changed: 61 additions & 19 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)