Skip to content

Commit 7659672

Browse files
Merge pull request #33 from Kpler/feat/PTFM-9207/delegate-kafka-schema-generation-to-project
refactor(kafka): delegate the JSON schema generation to the project
2 parents c646b44 + eb670cc commit 7659672

1 file changed

Lines changed: 81 additions & 99 deletions

File tree

kafka/check-local-schemas.sh

Lines changed: 81 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,26 @@ set -o errexit # Leave immediately if a command returns an error
55
set -o nounset # Leave immediately if an uninitialized value is used
66
set -o pipefail # Leave immediately if a command fails in a pipe
77

8+
shopt -s nullglob
9+
810
[[ "${BASH_VERSION}" =~ ^(5|4\.[0-9]).* ]] && shopt -s inherit_errexit
911

10-
SCRIPT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1112

1213
#####################################################################
1314
# Helper functions
1415
#####################################################################
1516

16-
error() {
17+
fatal() {
1718
local msg="$1" exit_code="${2:-1}"
18-
echo "ERROR: ${msg}">&2
19+
echo "FATAL: ${msg}">&2
1920
exit "${exit_code}"
2021
}
2122

23+
error() {
24+
local msg="$1"
25+
echo "ERROR: ${msg}">&2
26+
}
27+
2228
check_binary_exists() {
2329
local binary="$1"
2430
command -v "${binary}" &>/dev/null || error "${binary} is required but it's not installed"
@@ -36,99 +42,65 @@ get_repository_url() {
3642
git remote get-url origin
3743
}
3844

39-
get_md5sum() {
40-
local file="$1"
41-
md5sum "${file}" | awk '{ print $1}'
45+
md5sum_files() {
46+
[[ -z "$*" ]] || md5sum "$@"
4247
}
4348

44-
find_schema_class_file() {
45-
# The schema class heuristic is a bit hacky for now, we try to find a file
46-
# where a class has been annotated with the schema annotation
47-
# Otherwise we fallback on finding the filename containing the schema code
48-
# to end with Schema or is named InputModel
49-
# We might want to improve this in the future
50-
schema_class_file="$(grep -lr "^@schema" src | head -n 1 || return 0)"
51-
52-
if [[ -z "${schema_class_file}" ]]; then
53-
schema_class_file="$(find src -name "*Schema.scala" -o -name "InputModel.scala" | head -n 1 || return 0)"
54-
fi
55-
56-
echo "${schema_class_file}"
5749

50+
get_md5sum() {
51+
local file="$1" checksums="$2"
52+
awk -v file="${file}" '$2 == file { print $1 }' <<< "${checksums}"
5853
}
5954

60-
find_schema_class() {
61-
local schema_class_file="$1"
62-
schema_class_name="$(basename "${schema_class_file}" .scala)"
63-
schema_package="$(awk ' $1 == "package" { print $2 }' "${schema_class_file}")"
64-
65-
echo "${schema_package}.${schema_class_name}"
55+
detect_current_project_language() {
56+
if [[ -n "${PROJECT_LANGUAGE:-}" ]]; then
57+
echo "${PROJECT_LANGUAGE}"
58+
elif [[ -f "build.sbt" ]]; then
59+
echo "scala"
60+
else
61+
echo "unknown"
62+
fi
6663
}
6764

68-
is_library_used() {
69-
local library="$1" candidate_class_file="$2"
65+
fix_end_of_file() {
66+
local file="$1"
67+
[[ $(tail -c1 "${file}") == "" ]] || echo >> "${file}"
68+
}
7069

71-
# if the library is not directly found in the candidate class file
72-
# we fallback on checking the build.sbt file itself
73-
# This doesn't fully protect against indirect library loading
74-
# but it's a good enough heuristic for now
75-
for candidate in "${candidate_class_file}" build.sbt; do
76-
if grep -q -E "[^#]*${library}" "${candidate}"; then
77-
return 0
78-
fi
70+
fix_kafka_schemas_end_of_file() {
71+
for schema_file in $(find_schema_files); do
72+
fix_end_of_file "${schema_file}"
7973
done
80-
return 1
8174
}
8275

83-
find_avro_library() {
84-
local schema_class_file="$1"
85-
86-
if is_library_used "com.sksamuel.avro4s" "${schema_class_file}"; then
87-
echo "avro4s"
88-
elif is_library_used "vulcan" "${schema_class_file}"; then
89-
echo "vulcan"
90-
else
91-
error "Could not find any avro library import in ${schema_class_file}"
92-
fi
93-
76+
find_schema_files() {
77+
find schemas -type f -name '*.avsc' | sort
9478
}
9579

96-
generate_schema_generator_code() {
97-
local schema_class="$1" schema_library="$2"
98-
99-
schema_class_name="${schema_class##*.}"
100-
schema_package="${schema_class%.*}"
101-
102-
# only schema classes using vulcan are supported for now
103-
# but we might add support for avro4s in the future
104-
sed \
105-
-e "s/__SCHEMA_CLASS_NAME__/${schema_class_name}/g" \
106-
-e "s/__SCHEMA_PACKAGE__/${schema_package}/g" \
107-
"${SCRIPT_DIR}/generators/${schema_library^}SchemaGenerator.tmpl.scala"
80+
find_obsolete_schema_files() {
81+
local date="$1"
82+
find schemas -type f -name '*.avsc' -not -newermt "${date}"
10883
}
10984

110-
run_schema_generator_code() {
111-
local generator_code_file="$1" target_schema_file="$2"
112-
113-
generator_source_folder="$(dirname "${generator_code_file}")"
114-
115-
sbt_command=""
116-
# When fork is enabled, it seems we can't prevent all sbt logs from being printed
117-
# so we just disable it
118-
sbt_command+="set fork := false;"
119-
# We tell sbt to look for our generator code in the temporary folder in addition
120-
# to the existing source code, so we can run our generator code alongside the existing code
121-
# We need that as the generator code imports the schema class
122-
sbt_command+="set Compile / unmanagedSourceDirectories += file(\"${generator_source_folder}\");"
123-
# Dynamically add the required dependencies to the build.sbt file
124-
sbt_command+="set libraryDependencies += \"com.lihaoyi\" %% \"upickle\" % \"3.1.3\";"
125-
sbt_command+="set libraryDependencies += \"com.lihaoyi\" %% \"os-lib\" % \"0.9.1\";"
126-
127-
sbt_command+="runMain kp_pre_commit_hooks.generateSchemaFile ${target_schema_file}"
85+
generate_kafka_schemas_for_scala() {
86+
if ! sbt "tasks -V" | grep -qE "^ *generateKafkaSchemas "; then
87+
error "The project does not have a sbt generateKafkaSchemas task"
88+
fi
89+
sbt -batch -error "set fork := false; generateKafkaSchemas"
90+
}
12891

129-
sbt -batch -error "${sbt_command}"
130-
# Add a last linefeed to make pre-commit end-of-line fixer happy
131-
echo >> "${target_schema_file}"
92+
run_schema_generation_task() {
93+
local language="$1"
94+
case "${language}" in
95+
scala)
96+
check_binary_exists "sbt"
97+
generate_kafka_schemas_for_scala
98+
fix_kafka_schemas_end_of_file
99+
;;
100+
*)
101+
error "Unsupported language: ${language}"
102+
;;
103+
esac
132104
}
133105

134106
#####################################################################
@@ -137,32 +109,42 @@ run_schema_generator_code() {
137109

138110
trap clean_temporary_folder EXIT
139111

140-
# We don't want to run on template repositories
141-
[[ "$(get_repository_url)" != "git@github.com:Kpler/template-"* ]] || exit 0
112+
language="$(detect_current_project_language)"
113+
114+
before_schema_generation="$(date --date='-1 second' +'%Y-%m-%d %H:%M:%S')"
142115

143-
check_binary_exists "sbt"
116+
# shellcheck disable=SC2046
117+
schema_md5sum_before="$(md5sum_files $(find_schema_files))"
144118

145-
target_schema_file="schemas/schema.avsc"
119+
run_schema_generation_task "${language}"
146120

147-
generator_source_folder="$(mktemp -d)"
148-
generator_code_file="${generator_source_folder}/SchemaGenerator.scala"
121+
schema_files_generated=$(find_schema_files)
122+
[[ -n "${schema_files_generated}" ]] || fatal "No schema files found were generated"
149123

150-
[[ ! -f "${target_schema_file}" ]] || checksum_before="$(get_md5sum "${target_schema_file}")"
124+
# shellcheck disable=SC2086
125+
schema_md5sum_after="$(md5sum_files ${schema_files_generated})"
151126

152-
schema_class_file="$(find_schema_class_file)"
153-
[[ -n "${schema_class_file}" ]] || error "Could not find any schema class file"
127+
error_found="false"
154128

155-
schema_class="$(find_schema_class "${schema_class_file}")"
156-
schema_library="$(find_avro_library "${schema_class_file}")"
129+
for schema_file in ${schema_files_generated}; do
130+
if ! is_git_tracked "${schema_file}"; then
131+
error "Schema file \"${schema_file}\" is not tracked by git. Please commit it."
132+
error_found="true"
133+
fi
157134

158-
generate_schema_generator_code "${schema_class}" "${schema_library}" > "${generator_code_file}"
159-
run_schema_generator_code "${generator_code_file}" "${target_schema_file}"
135+
checksum_after="$(get_md5sum "${schema_file}" "${schema_md5sum_after}")"
136+
checksum_before="$(get_md5sum "${schema_file}" "${schema_md5sum_before}")"
137+
if [[ "${checksum_after}" != "${checksum_before}" ]]; then
138+
error "Schema file \"${schema_file}\" is not consistent with code. Please commit the updated version."
139+
error_found="true"
140+
fi
141+
done
160142

161-
if ! is_git_tracked "${target_schema_file}"; then
162-
error "Schema file \"${target_schema_file}\" is not tracked by git. Please commit it."
143+
obsolete_schemas_files=$(find_obsolete_schema_files "${before_schema_generation}")
144+
if [[ -n "${obsolete_schemas_files}" ]]; then
145+
error "The following schema files seem obsolete: ${obsolete_schemas_files}. Please delete them."
146+
error_found="true"
163147
fi
164148

165-
checksum_after="$(get_md5sum "${target_schema_file}")"
166-
if [[ "${checksum_after}" != "${checksum_before:-}" ]]; then
167-
error "Schema file \"${target_schema_file}\" was missing or not consistent with code. Please commit the updated version."
168-
fi
149+
[[ "${error_found}" == "false" ]] || exit 1
150+

0 commit comments

Comments
 (0)