Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions app/services/active_storage/staging_bucket_backfill.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
require 'aws-sdk-s3'
require 'cgi'
require 'set'

module ActiveStorage
class StagingBucketBackfill
DEFAULT_DB_BATCH_SIZE = 1_000

Result = Struct.new(
:scanned_blob_keys_count,
:missing_in_staging_count,
:copied_objects_count,
:missing_in_production_count,
:dry_run,
keyword_init: true
)

def self.call(...)
new(...).call
end

def initialize(
s3_client:,
source_bucket_name:,
destination_bucket_name:,
blob_scope: ::ActiveStorage::Blob.all,
dry_run: true,
db_batch_size: DEFAULT_DB_BATCH_SIZE,
logger: Rails.logger
)
@s3_client = s3_client
@source_bucket_name = source_bucket_name
@destination_bucket_name = destination_bucket_name
@blob_scope = blob_scope
@dry_run = dry_run
@db_batch_size = db_batch_size
@logger = logger
end

def call
ensure_supported_environment!

# The one-off restore repair should avoid one S3 existence check per blob.
# Listing the destination bucket once and diffing against DB keys keeps
# the repair cost proportional to bucket pages plus actual copies.
destination_keys = load_bucket_keys(destination_bucket_name)
scanned_blob_keys_count = 0
missing_in_staging_count = 0
copied_objects_count = 0
missing_in_production_count = 0

blob_scope.select(:key).in_batches(of: db_batch_size) do |relation|
relation.pluck(:key).each do |key|
scanned_blob_keys_count += 1
next if destination_keys.include?(key)

missing_in_staging_count += 1

if dry_run
logger.info "ActiveStorage staging bucket backfill dry run would copy key=#{key}"
next
end

copied_objects_count += copy_key_from_production(key)
rescue Aws::S3::Errors::NoSuchKey
missing_in_production_count += 1
logger.warn "ActiveStorage staging bucket backfill could not find production key=#{key}"
end
end

Result.new(
scanned_blob_keys_count:,
missing_in_staging_count:,
copied_objects_count:,
missing_in_production_count:,
dry_run:
)
end

private

attr_reader :blob_scope, :db_batch_size, :destination_bucket_name, :dry_run,
:logger, :s3_client, :source_bucket_name

def ensure_supported_environment!
# This repair copies production-backed files into staging to match a
# restored staging DB. Restricting it to staging prevents accidental
# cross-environment copying in production.
return if Rails.env.staging?

raise "#{self.class.name} only supports staging"
end

def load_bucket_keys(bucket_name)
keys = Set.new
continuation_token = nil

loop do
response = s3_client.list_objects_v2(
bucket: bucket_name,
continuation_token:
)

response.contents.each { |object| keys.add(object.key) }
break unless response.is_truncated

continuation_token = response.next_continuation_token
end

keys
end

def copy_key_from_production(key)
s3_client.copy_object(
bucket: destination_bucket_name,
key:,
copy_source: "#{source_bucket_name}/#{CGI.escape(key).gsub('+', '%20')}"
)

logger.info "ActiveStorage staging bucket backfill copied key=#{key}"
1
end
end
end
147 changes: 147 additions & 0 deletions app/services/active_storage/staging_bucket_cleanup.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
require 'aws-sdk-s3'
require 'set'

module ActiveStorage
class StagingBucketCleanup
DEFAULT_DELETE_BATCH_SIZE = 1_000
DEFAULT_DB_BATCH_SIZE = 1_000

Result = Struct.new(
:scanned_objects_count,
:orphaned_objects_count,
:deleted_objects_count,
:dry_run,
keyword_init: true
)

def self.call(...)
new(...).call
end

def initialize(
s3_client:,
bucket_name:,
blob_scope: ::ActiveStorage::Blob.all,
dry_run: true,
delete_batch_size: DEFAULT_DELETE_BATCH_SIZE,
db_batch_size: DEFAULT_DB_BATCH_SIZE,
delete_only_if_last_modified_before: nil,
logger: Rails.logger
)
@s3_client = s3_client
@bucket_name = bucket_name
@blob_scope = blob_scope
@dry_run = dry_run
@delete_batch_size = delete_batch_size
@db_batch_size = db_batch_size
@delete_only_if_last_modified_before = delete_only_if_last_modified_before
@logger = logger
end

def call
ensure_supported_environment!

# We compare the staging bucket against the current staging DB because a
# production DB restore can leave behind objects that Rails no longer
# knows about. Cleaning from the DB view avoids one S3 existence check per
# blob, which keeps the restore-time task cheap enough to run routinely.
valid_blob_keys = load_valid_blob_keys
orphan_keys_to_delete = []
scanned_objects_count = 0
orphaned_objects_count = 0
deleted_objects_count = 0

each_bucket_object do |object|
scanned_objects_count += 1
next if valid_blob_keys.include?(object.key)
next if delete_only_if_last_modified_before.present? &&
object.last_modified >= delete_only_if_last_modified_before

orphan_keys_to_delete << object.key
orphaned_objects_count += 1

next unless orphan_keys_to_delete.size >= delete_batch_size

deleted_objects_count += delete_keys(orphan_keys_to_delete)
orphan_keys_to_delete.clear
end

deleted_objects_count += delete_keys(orphan_keys_to_delete) if orphan_keys_to_delete.any?

Result.new(
scanned_objects_count:,
orphaned_objects_count:,
deleted_objects_count:,
dry_run:
)
end

private

attr_reader :blob_scope, :bucket_name, :db_batch_size, :delete_batch_size,
:delete_only_if_last_modified_before, :dry_run, :logger, :s3_client

def ensure_supported_environment!
# This cleanup is intentionally limited to staging
# because its job is to discard bucket objects that are no longer
# represented in the current DB snapshot. That is appropriate for staging
# restore hygiene, but it would be too destructive to allow elsewhere.
return if Rails.env.staging?

raise "#{self.class.name} only supports staging"
end

def load_valid_blob_keys
keys = Set.new

blob_scope.select(:key).in_batches(of: db_batch_size) do |relation|
relation.pluck(:key).each { |key| keys.add(key) }
end

keys
end

def each_bucket_object
continuation_token = nil

loop do
response = s3_client.list_objects_v2(
bucket: bucket_name,
continuation_token:
)

response.contents.each { |object| yield object }

break unless response.is_truncated

continuation_token = response.next_continuation_token
end
end

def delete_keys(keys)
if dry_run
keys.each do |key|
# Dry-run output needs to show the exact candidate keys so operators
# can validate the deletion set before any irreversible cleanup.
logger.info "ActiveStorage staging bucket cleanup dry run would delete key=#{key}"
end
logger.info "ActiveStorage staging bucket cleanup dry run would delete #{keys.size} objects"
return 0
end

# S3 supports deleting up to 1,000 objects per request. Batching on that
# boundary keeps request volume low without trying to issue a single huge
# delete operation that S3 would reject.
s3_client.delete_objects(
bucket: bucket_name,
delete: {
objects: keys.map { |key| { key: } },
quiet: true
}
)

logger.info "ActiveStorage staging bucket cleanup deleted #{keys.size} objects"
keys.size
end
end
end
Loading