Skip to content

Commit eb670cc

Browse files
committed
refactor(kafka): delegate the JSON schema generation to the project
The automatic schema generation heuristic was nice for simple cases but has several drawbacks: - it does not work as soon as devs get a bit more creative in their code (uses parametrized...). This is difficult to anticipate, and we should never underestimate devs' creativity. - when it does not work as expected in their project, devs are not really autonomous to debug it, since the code runs in the CI and they are unlikely to look at the mix of shell script and templated Scala code used in the GitHub action. For all these reasons, the code generation for Scala now only calls a standardized sbt target that is expected to be implemented in projects. It will be implemented out of the box in template projects, and devs are free to change it when they use a different implementation.
1 parent c646b44 commit eb670cc

1 file changed

Lines changed: 81 additions & 99 deletions

File tree

kafka/check-local-schemas.sh

Lines changed: 81 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,26 @@ set -o errexit # Leave immediately if a command returns an error
55
set -o nounset # Leave immediately if an unitialized value is used
66
set -o pipefail # Leave immediately if a command fails in a pipe
77

8+
shopt -s nullglob
9+
810
[[ "${BASH_VERSION}" =~ ^(5|4\.[0-9]).* ]] && shopt -s inherit_errexit
911

10-
SCRIPT_DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
1112

1213
#####################################################################
1314
# Helper functions
1415
#####################################################################
1516

16-
error() {
17+
fatal() {
1718
local msg="$1" exit_code="${2:-1}"
18-
echo "ERROR: ${msg}">&2
19+
echo "FATAL: ${msg}">&2
1920
exit "${exit_code}"
2021
}
2122

23+
# Report a non-fatal problem: writes "ERROR: <message>" to stderr and
# returns to the caller (it does not exit the script).
# Arguments: $1 - message to display
error() {
  local message="$1"
  printf '%s\n' "ERROR: ${message}" >&2
}
27+
2228
check_binary_exists() {
2329
local binary="$1"
2430
command -v "${binary}" &>/dev/null || error "${binary} is required but it's not installed"
@@ -36,99 +42,65 @@ get_repository_url() {
3642
git remote get-url origin
3743
}
3844

39-
get_md5sum() {
40-
local file="$1"
41-
md5sum "${file}" | awk '{ print $1}'
45+
# Print the md5sum output (one "<checksum>  <file>" line per file) for the
# files given as arguments.
# Prints nothing and succeeds when called without any (non-empty) argument,
# so md5sum is never started without a file operand.
# Arguments: $@ - files to checksum
md5sum_files() {
  if [[ -n "$*" ]]; then
    md5sum "$@"
  fi
}
4348

44-
find_schema_class_file() {
45-
# The schema class heuristic is a bit hacky for now, we try to find a file
46-
# where a class has been annotated with the schema annotation
47-
# Otherwise we fallback on finding the filename containing the schema code
48-
# to end with Schema or is named InputModel
49-
# We might want to improve this in the future
50-
schema_class_file="$(grep -lr "^@schema" src | head -n 1 || return 0)"
51-
52-
if [[ -z "${schema_class_file}" ]]; then
53-
schema_class_file="$(find src -name "*Schema.scala" -o -name "InputModel.scala" | head -n 1 || return 0)"
54-
fi
55-
56-
echo "${schema_class_file}"
5749

50+
# Look up the checksum of one file in a previously captured md5sum listing.
# Arguments: $1 - file name, as it appears in the second column of the listing
#            $2 - md5sum output ("<checksum>  <file>" lines)
# Outputs:   the checksum of the matching line(s); nothing if the file is
#            not part of the listing
get_md5sum() {
  local path="$1" listing="$2"
  printf '%s\n' "${listing}" \
    | awk -v file="${path}" '{ if ($2 == file) print $1 }'
}
5954

60-
find_schema_class() {
61-
local schema_class_file="$1"
62-
schema_class_name="$(basename "${schema_class_file}" .scala)"
63-
schema_package="$(awk ' $1 == "package" { print $2 }' "${schema_class_file}")"
64-
65-
echo "${schema_package}.${schema_class_name}"
55+
# Print the language of the project located in the current directory.
# Globals:  PROJECT_LANGUAGE (read) - when set and non-empty, it is printed
#           as-is and no detection is attempted
# Outputs:  "scala" when a build.sbt file exists in the current directory,
#           "unknown" otherwise
detect_current_project_language() {
  local detected="unknown"

  if [[ -n "${PROJECT_LANGUAGE:-}" ]]; then
    detected="${PROJECT_LANGUAGE}"
  elif [[ -f "build.sbt" ]]; then
    detected="scala"
  fi

  echo "${detected}"
}
6764

68-
is_library_used() {
69-
local library="$1" candidate_class_file="$2"
65+
# Make sure a file ends with a linefeed, appending one when needed.
# Arguments: $1 - file to fix in place
fix_end_of_file() {
  local path="$1"
  # Command substitution strips a trailing newline, so the captured last byte
  # is empty when the file already ends with a linefeed (or is empty).
  if [[ -n "$(tail -c1 "${path}")" ]]; then
    echo >> "${path}"
  fi
}
7069

71-
# if the library is not directly found in the candidate class file
72-
# we fallback on checking the build.sbt file itself
73-
# This doesn't fully protect against from indirect library loading
74-
# but it's a good enough heuristic for now
75-
for candidate in "${candidate_class_file}" build.sbt; do
76-
if grep -q -E "[^#]*${library}" "${candidate}"; then
77-
return 0
78-
fi
70+
fix_kafka_schemas_end_of_file() {
71+
for schema_file in $(find_schema_files); do
72+
fix_end_of_file "${schema_file}"
7973
done
80-
return 1
8174
}
8275

83-
find_avro_library() {
84-
local schema_class_file="$1"
85-
86-
if is_library_used "com.sksamuel.avro4s" "${schema_class_file}"; then
87-
echo "avro4s"
88-
elif is_library_used "vulcan" "${schema_class_file}"; then
89-
echo "vulcan"
90-
else
91-
error "Could not find any avro library import in ${schema_class_file}"
92-
fi
93-
76+
# List the Avro schema files (*.avsc) found under the "schemas" folder of the
# current directory, one per line, sorted.
# Outputs: the sorted list of schema files; nothing when there is none
# Returns: 0 when the "schemas" folder does not exist yet (e.g. before the
#          very first generation), so that callers running with
#          errexit/pipefail get an empty list instead of being aborted by the
#          failing find.
find_schema_files() {
  [[ -d schemas ]] || return 0
  find schemas -type f -name '*.avsc' | sort
}
9579

96-
generate_schema_generator_code() {
97-
local schema_class="$1" schema_library="$2"
98-
99-
schema_class_name="${schema_class##*.}"
100-
schema_package="${schema_class%.*}"
101-
102-
# only schema class using vulcan are supported for now
103-
# but we might add support for avro4s in the future
104-
sed \
105-
-e "s/__SCHEMA_CLASS_NAME__/${schema_class_name}/g" \
106-
-e "s/__SCHEMA_PACKAGE__/${schema_package}/g" \
107-
"${SCRIPT_DIR}/generators/${schema_library^}SchemaGenerator.tmpl.scala"
80+
# List the schema files (*.avsc) under the "schemas" folder that were not
# modified after a given point in time.
# Arguments: $1 - reference date, in a format understood by find -newermt
# Outputs:   the matching files, one per line
find_obsolete_schema_files() {
  local cutoff="$1"
  find schemas -type f -name '*.avsc' ! -newermt "${cutoff}"
}
10984

110-
run_schema_generator_code() {
111-
local generator_code_file="$1" target_schema_file="$2"
112-
113-
generator_source_folder="$(dirname "${generator_code_file}")"
114-
115-
sbt_command=""
116-
# When fork is enabled, it seems we can't avoid all sbt logs to be printed
117-
# so we just disable it
118-
sbt_command+="set fork := false;"
119-
# We tell sbt to look for our generator code in the temporary folder in addition
120-
# to the existing source code, so we can run our generator code alongside the existing code
121-
# We need that as the generator code import the schema class
122-
sbt_command+="set Compile / unmanagedSourceDirectories += file(\"${generator_source_folder}\");"
123-
# Dynamically add the required dependencies to the build.sbt file
124-
sbt_command+="set libraryDependencies += \"com.lihaoyi\" %% \"upickle\" % \"3.1.3\";"
125-
sbt_command+="set libraryDependencies += \"com.lihaoyi\" %% \"os-lib\" % \"0.9.1\";"
126-
127-
sbt_command+="runMain kp_pre_commit_hooks.generateSchemaFile ${target_schema_file}"
85+
# Generate the Kafka schemas of a Scala project by running its standardized
# "generateKafkaSchemas" sbt task.
# Stops the script (through fatal) when the sbt tasks cannot be listed or when
# the project does not define the task, instead of running a task that does
# not exist.
# Returns: the exit status of the sbt generation command
generate_kafka_schemas_for_scala() {
  local available_tasks

  # The task list is captured before being searched: piping sbt straight into
  # "grep -q" lets grep exit on the first match, which can kill sbt with
  # SIGPIPE and, with pipefail enabled, wrongly report the task as missing.
  available_tasks="$(sbt "tasks -V")" \
    || fatal "Could not list the sbt tasks of the project"

  if ! grep -qE "^ *generateKafkaSchemas " <<< "${available_tasks}"; then
    fatal "The project does not have a sbt generateKafkaSchemas task"
  fi

  # fork is disabled for this run only
  sbt -batch -error "set fork := false; generateKafkaSchemas"
}
12891

129-
sbt -batch -error "${sbt_command}"
130-
# Add a last linefeed to make pre-commit end-of-line fixer happy
131-
echo >> "${target_schema_file}"
92+
# Run the schema generation matching the project language.
# Arguments: $1 - project language; only "scala" is handled
# For scala: checks that sbt is available, runs the sbt generation task and
# then fixes the end of file of the schema files.
# Any other language stops the script (through fatal): nothing can be
# generated, so the checks performed afterwards would be meaningless.
run_schema_generation_task() {
  local language="$1"

  case "${language}" in
    scala)
      check_binary_exists "sbt"
      generate_kafka_schemas_for_scala
      fix_kafka_schemas_end_of_file
      ;;
    *)
      fatal "Unsupported language: ${language}"
      ;;
  esac
}
133105

134106
#####################################################################
@@ -137,32 +109,42 @@ run_schema_generator_code() {
137109

138110
trap clean_temporary_folder EXIT
139111

140-
# We don't want to run on template repositories
141-
[[ "$(get_repository_url)" != "git@github.com:Kpler/template-"* ]] || exit 0
112+
language="$(detect_current_project_language)"
113+
114+
before_schema_generation="$(date --date='-1 second' +'%Y-%m-%d %H:%M:%S')"
142115

143-
check_binary_exists "sbt"
116+
# shellcheck disable=SC2046
117+
schema_md5sum_before="$(md5sum_files $(find_schema_files))"
144118

145-
target_schema_file="schemas/schema.avsc"
119+
run_schema_generation_task "${language}"
146120

147-
generator_source_folder="$(mktemp -d)"
148-
generator_code_file="${generator_source_folder}/SchemaGenerator.scala"
121+
schema_files_generated=$(find_schema_files)
122+
[[ -n "${schema_files_generated}" ]] || fatal "No schema files found were generated"
149123

150-
[[ ! -f "${target_schema_file}" ]] || checksum_before="$(get_md5sum "${target_schema_file}")"
124+
# shellcheck disable=SC2086
125+
schema_md5sum_after="$(md5sum_files ${schema_files_generated})"
151126

152-
schema_class_file="$(find_schema_class_file)"
153-
[[ -n "${schema_class_file}" ]] || error "Could not find any schema class file"
127+
error_found="false"
154128

155-
schema_class="$(find_schema_class "${schema_class_file}")"
156-
schema_library="$(find_avro_library "${schema_class_file}")"
129+
for schema_file in ${schema_files_generated}; do
130+
if ! is_git_tracked "${schema_file}"; then
131+
error "Schema file \"${schema_file}\" is not tracked by git. Please commit it."
132+
error_found="true"
133+
fi
157134

158-
generate_schema_generator_code "${schema_class}" "${schema_library}" > "${generator_code_file}"
159-
run_schema_generator_code "${generator_code_file}" "${target_schema_file}"
135+
checksum_after="$(get_md5sum "${schema_file}" "${schema_md5sum_after}")"
136+
checksum_before="$(get_md5sum "${schema_file}" "${schema_md5sum_before}")"
137+
if [[ "${checksum_after}" != "${checksum_before}" ]]; then
138+
error "Schema file \"${schema_file}\" is not consistent with code. Please commit the updated version."
139+
error_found="true"
140+
fi
141+
done
160142

161-
if ! is_git_tracked "${target_schema_file}"; then
162-
error "Schema file \"${target_schema_file}\" is not tracked by git. Please commit it."
143+
obsolete_schemas_files=$(find_obsolete_schema_files "${before_schema_generation}")
144+
if [[ -n "${obsolete_schemas_files}" ]]; then
145+
error "The following schema files seem obsolete: ${obsolete_schemas_files}. Please delete them."
146+
error_found="true"
163147
fi
164148

165-
checksum_after="$(get_md5sum "${target_schema_file}")"
166-
if [[ "${checksum_after}" != "${checksum_before:-}" ]]; then
167-
error "Schema file \"${target_schema_file}\" was missing or not consistent with code. Please commit the updated version."
168-
fi
149+
[[ "${error_found}" == "false" ]] || exit 1
150+

0 commit comments

Comments
 (0)