Skip to content

Commit fecec43

Browse files
committed
some test case
1 parent ef3e211 commit fecec43

20 files changed

Lines changed: 706 additions & 267 deletions

File tree

be/src/core/column/column_file.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class ColumnFile final : public COWHelper<IColumn, ColumnFile> {
4343

4444
Field operator[](size_t n) const override { return (*_data)[n]; }
4545
void get(size_t n, Field& res) const override { _data->get(n, res); }
46+
StringRef get_data_at(size_t n) const override { return _data->get_data_at(n); }
4647

4748
void insert(const Field& x) override;
4849
void insert_from(const IColumn& src, size_t n) override;

be/src/core/data_type/data_type_file.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,6 @@ void DataTypeFile::to_pb_column_meta(PColumnMeta* col_meta) const {
6565
_physical_type.to_pb_column_meta(col_meta);
6666
}
6767

68-
std::optional<size_t> DataTypeFile::try_get_subfield(std::string_view name) const {
69-
return _schema.try_get_position(name);
70-
}
71-
7268
// Returns the data type of the schema field at position `idx`, as recorded in `_schema`.
// NOTE(review): no bounds check is visible here; presumably `idx` must be a valid
// field position — confirm against FileSchemaDescriptor::field.
const DataTypePtr& DataTypeFile::get_subfield_type(size_t idx) const {
7369
return _schema.field(idx).type;
7470
}

be/src/core/data_type/data_type_file.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ class DataTypeFile final : public IDataType {
5858
}
5959
void to_pb_column_meta(PColumnMeta* col_meta) const override;
6060

61-
std::optional<size_t> try_get_subfield(std::string_view name) const;
6261
const DataTypePtr& get_subfield_type(size_t idx) const;
6362
const FileSchemaDescriptor& schema() const { return _schema; }
6463

be/src/core/data_type/file_schema_descriptor.cpp

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -42,31 +42,16 @@ FileSchemaDescriptor::FileSchemaDescriptor() {
4242
});
4343
};
4444

45-
add_field(FILE_FIELD_URI.data(), std::make_shared<DataTypeString>(4096, TYPE_VARCHAR));
46-
add_field(FILE_FIELD_FILE_NAME.data(), std::make_shared<DataTypeString>(512, TYPE_VARCHAR));
47-
add_field(FILE_FIELD_CONTENT_TYPE.data(), std::make_shared<DataTypeString>(128, TYPE_VARCHAR));
48-
add_field(FILE_FIELD_SIZE.data(), std::make_shared<DataTypeInt64>());
49-
add_field(FILE_FIELD_REGION.data(),
50-
make_nullable(std::make_shared<DataTypeString>(64, TYPE_VARCHAR)));
51-
add_field(FILE_FIELD_ENDPOINT.data(),
52-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
53-
add_field(FILE_FIELD_AK.data(),
54-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
55-
add_field(FILE_FIELD_SK.data(),
56-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
57-
add_field(FILE_FIELD_ROLE_ARN.data(),
58-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
59-
add_field(FILE_FIELD_EXTERNAL_ID.data(),
60-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
61-
}
62-
63-
std::optional<size_t> FileSchemaDescriptor::try_get_position(std::string_view name) const {
64-
for (size_t i = 0; i < _fields.size(); ++i) {
65-
if (name == _fields[i].name) {
66-
return i;
67-
}
68-
}
69-
return std::nullopt;
45+
add_field("uri", std::make_shared<DataTypeString>(4096, TYPE_VARCHAR));
46+
add_field("file_name", std::make_shared<DataTypeString>(512, TYPE_VARCHAR));
47+
add_field("content_type", std::make_shared<DataTypeString>(128, TYPE_VARCHAR));
48+
add_field("size", std::make_shared<DataTypeInt64>());
49+
add_field("region", make_nullable(std::make_shared<DataTypeString>(64, TYPE_VARCHAR)));
50+
add_field("endpoint", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
51+
add_field("ak", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
52+
add_field("sk", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
53+
add_field("role_arn", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
54+
add_field("external_id", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
7055
}
7156

7257
std::string FileSchemaDescriptor::extract_file_name(std::string_view uri) {
@@ -144,4 +129,33 @@ void FileSchemaDescriptor::write_jsonb_string(JsonbWriter& writer, const std::st
144129
// Writes `key` to `writer` as a JSONB object key.
// The key length is narrowed to uint8_t through cast_set before being handed to writeKey.
// NOTE(review): keys are presumably expected to be at most 255 bytes — confirm how
// cast_set behaves when the size does not fit.
void FileSchemaDescriptor::write_jsonb_key(JsonbWriter& writer, std::string_view key) {
145130
writer.writeKey(key.data(), cast_set<uint8_t>(key.size()));
146131
}
132+
133+
// Serializes `metadata` into `writer` as a single JSONB object.
//
// Keys are taken from the singleton schema (instance()) and emitted in this order:
// uri, file_name, content_type, size, region, endpoint, ak, sk, role_arn, external_id.
// uri / file_name / content_type are always written as strings, size as an int64, and
// the remaining six string members are written as JSONB null when they are empty.
void FileSchemaDescriptor::write_file_jsonb(JsonbWriter& writer, const FileMetadata& metadata) {
    const auto& desc = instance();

    // Emits only the key for `id`; the caller writes the matching value right after.
    auto put_key = [&](Field id) { write_jsonb_key(writer, desc.field_name(id)); };

    // Key followed by a string value, written even when the string is empty.
    auto put_string = [&](Field id, const std::string& value) {
        put_key(id);
        write_jsonb_string(writer, value);
    };

    // Key followed by either the string value or, for an empty string, JSONB null.
    auto put_optional = [&](Field id, const std::string& value) {
        put_key(id);
        if (!value.empty()) {
            write_jsonb_string(writer, value);
            return;
        }
        writer.writeNull();
    };

    writer.writeStartObject();

    put_string(Field::URI, metadata.uri);
    put_string(Field::FILE_NAME, metadata.file_name);
    put_string(Field::CONTENT_TYPE, metadata.content_type);

    put_key(Field::SIZE);
    writer.writeInt64(metadata.size);

    put_optional(Field::REGION, metadata.region);
    put_optional(Field::ENDPOINT, metadata.endpoint);
    put_optional(Field::AK, metadata.ak);
    put_optional(Field::SK, metadata.sk);
    put_optional(Field::ROLE_ARN, metadata.role_arn);
    put_optional(Field::EXTERNAL_ID, metadata.external_id);

    writer.writeEndObject();
}
147161
} // namespace doris

be/src/core/data_type/file_schema_descriptor.h

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,18 @@ struct FileFieldDesc {
3131
DataTypePtr type;
3232
};
3333

34-
inline constexpr std::string_view FILE_FIELD_URI = "uri";
35-
inline constexpr std::string_view FILE_FIELD_FILE_NAME = "file_name";
36-
inline constexpr std::string_view FILE_FIELD_CONTENT_TYPE = "content_type";
37-
inline constexpr std::string_view FILE_FIELD_SIZE = "size";
38-
inline constexpr std::string_view FILE_FIELD_REGION = "region";
39-
inline constexpr std::string_view FILE_FIELD_ENDPOINT = "endpoint";
40-
inline constexpr std::string_view FILE_FIELD_AK = "ak";
41-
inline constexpr std::string_view FILE_FIELD_SK = "sk";
42-
inline constexpr std::string_view FILE_FIELD_ROLE_ARN = "role_arn";
43-
inline constexpr std::string_view FILE_FIELD_EXTERNAL_ID = "external_id";
34+
// Plain value bundle describing one file. Its members carry the values for the FILE
// schema fields uri, file_name, content_type, size, region, endpoint, ak, sk, role_arn
// and external_id; FileSchemaDescriptor::write_file_jsonb serializes it to JSONB.
struct FileMetadata {
35+
std::string uri;
36+
std::string file_name;
37+
std::string content_type;
38+
// Defaults to 0 when not set explicitly.
int64_t size = 0;
39+
// region .. external_id: FileSchemaDescriptor::write_file_jsonb writes each of
// these as JSONB null when the string is empty.
std::string region;
40+
std::string endpoint;
41+
std::string ak;
42+
std::string sk;
43+
std::string role_arn;
44+
std::string external_id;
45+
};
4446

4547
// now struct FileInfo only contains file_name and file_size,
4648
// and if we want to get ETAG and LAST_MODIFIED_AT, need refactor FileInfo and the underlying file system client to support them.
@@ -65,7 +67,6 @@ class FileSchemaDescriptor final {
6567
return _fields[static_cast<size_t>(field_id)];
6668
}
6769
std::string_view field_name(Field field_id) const { return field(field_id).name; }
68-
std::optional<size_t> try_get_position(std::string_view name) const;
6970

7071
// Shared utilities for FILE type serialization.
7172
static std::string extract_file_name(std::string_view uri);
@@ -75,6 +76,8 @@ class FileSchemaDescriptor final {
7576
static std::string extension_to_content_type(const std::string& ext);
7677
static void write_jsonb_string(JsonbWriter& writer, const std::string& value);
7778
static void write_jsonb_key(JsonbWriter& writer, std::string_view key);
79+
// Serializes a FileMetadata into a complete JSONB object.
80+
static void write_file_jsonb(JsonbWriter& writer, const FileMetadata& metadata);
7881

7982
private:
8083
FileSchemaDescriptor();

be/src/exprs/function/function_file.cpp

Lines changed: 72 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <optional>
2121
#include <string>
2222
#include <string_view>
23+
#include <vector>
2324

2425
#include "common/cast_set.h"
2526
#include "common/status.h"
@@ -34,7 +35,10 @@
3435
#include "core/data_type/file_schema_descriptor.h"
3536
#include "exprs/function/function.h"
3637
#include "exprs/function/simple_function_factory.h"
38+
#include "io/fs/obj_storage_client.h"
3739
#include "util/jsonb_writer.h"
40+
#include "util/s3_uri.h"
41+
#include "util/s3_util.h"
3842

3943
namespace doris {
4044

@@ -48,25 +52,27 @@ class FunctionToFile : public IFunction {
4852

4953
bool is_variadic() const override { return false; }
5054

51-
size_t get_number_of_arguments() const override { return 4; }
55+
size_t get_number_of_arguments() const override { return 5; }
5256

5357
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
5458
return std::make_shared<DataTypeFile>();
5559
}
5660

5761
Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
5862
uint32_t result, size_t input_rows_count) const override {
59-
DCHECK_EQ(arguments.size(), 4);
63+
DCHECK_EQ(arguments.size(), 5);
6064

61-
ColumnPtr uri_holder, endpoint_holder, ak_holder, sk_holder;
65+
ColumnPtr uri_holder, region_holder, endpoint_holder, ak_holder, sk_holder;
6266
const ColumnString* uri_col =
6367
_unwrap_string_column(block.get_by_position(arguments[0]), uri_holder);
68+
const ColumnString* region_col =
69+
_unwrap_string_column(block.get_by_position(arguments[1]), region_holder);
6470
const ColumnString* endpoint_col =
65-
_unwrap_string_column(block.get_by_position(arguments[1]), endpoint_holder);
71+
_unwrap_string_column(block.get_by_position(arguments[2]), endpoint_holder);
6672
const ColumnString* ak_col =
67-
_unwrap_string_column(block.get_by_position(arguments[2]), ak_holder);
73+
_unwrap_string_column(block.get_by_position(arguments[3]), ak_holder);
6874
const ColumnString* sk_col =
69-
_unwrap_string_column(block.get_by_position(arguments[3]), sk_holder);
75+
_unwrap_string_column(block.get_by_position(arguments[4]), sk_holder);
7076

7177
using S = FileSchemaDescriptor;
7278
const auto& schema = S::instance();
@@ -77,15 +83,54 @@ class FunctionToFile : public IFunction {
7783

7884
for (size_t row = 0; row < input_rows_count; ++row) {
7985
std::string uri = uri_col->get_data_at(row).to_string();
86+
std::string region = region_col->get_data_at(row).to_string();
8087
std::string endpoint = endpoint_col->get_data_at(row).to_string();
8188
std::string ak = ak_col->get_data_at(row).to_string();
8289
std::string sk = sk_col->get_data_at(row).to_string();
8390
std::string file_name = S::extract_file_name(uri);
8491
std::string content_type =
8592
S::extension_to_content_type(S::extract_file_extension(file_name));
8693

94+
// Ensure endpoint has http:// prefix for S3 SDK.
95+
std::string normalized_endpoint = _normalize_endpoint(endpoint);
96+
97+
// Validate the object exists via HEAD request and get actual size.
98+
S3ClientConf s3_conf;
99+
s3_conf.endpoint = normalized_endpoint;
100+
s3_conf.region = region;
101+
s3_conf.ak = ak;
102+
s3_conf.sk = sk;
103+
auto s3_client = S3ClientFactory::instance().create(s3_conf);
104+
if (!s3_client) {
105+
return Status::InternalError(
106+
"to_file: failed to create S3 client for endpoint '{}'", endpoint);
107+
}
108+
// Normalize oss:// etc. to s3:// for S3URI parser and storage.
109+
std::string normalized_uri = _normalize_uri_scheme(uri);
110+
S3URI s3_uri(normalized_uri);
111+
RETURN_IF_ERROR(s3_uri.parse());
112+
auto head_resp = s3_client->head_object(
113+
{.bucket = s3_uri.get_bucket(), .key = s3_uri.get_key()});
114+
if (head_resp.resp.status.code != 0) {
115+
return Status::InvalidArgument("to_file: object '{}' is not accessible: {}", uri,
116+
head_resp.resp.status.msg);
117+
}
118+
int64_t file_size = head_resp.file_size;
119+
87120
writer.reset();
88-
_write_file_jsonb(writer, schema, uri, file_name, content_type, endpoint, ak, sk);
121+
FileMetadata metadata {
122+
.uri = normalized_uri,
123+
.file_name = file_name,
124+
.content_type = content_type,
125+
.size = file_size,
126+
.region = region,
127+
.endpoint = normalized_endpoint,
128+
.ak = ak,
129+
.sk = sk,
130+
.role_arn = {},
131+
.external_id = {},
132+
};
133+
S::write_file_jsonb(writer, metadata);
89134
jsonb_col.insert_data(writer.getOutput()->getBuffer(), writer.getOutput()->getSize());
90135
}
91136
block.replace_by_position(result, std::move(result_col));
@@ -102,40 +147,26 @@ class FunctionToFile : public IFunction {
102147
return &assert_cast<const ColumnString&>(*holder);
103148
}
104149

105-
static void _write_file_jsonb(JsonbWriter& writer, const FileSchemaDescriptor& schema,
106-
const std::string& uri, const std::string& file_name,
107-
const std::string& content_type,
108-
const std::string& endpoint,
109-
const std::string& ak, const std::string& sk) {
110-
using S = FileSchemaDescriptor;
111-
auto write_nullable_str = [&](S::Field field, const std::string& s) {
112-
S::write_jsonb_key(writer, schema.field_name(field));
113-
if (s.empty()) {
114-
writer.writeNull();
115-
} else {
116-
S::write_jsonb_string(writer, s);
117-
}
118-
};
119-
120-
writer.writeStartObject();
121-
S::write_jsonb_key(writer, schema.field_name(S::Field::URI));
122-
S::write_jsonb_string(writer, uri);
123-
S::write_jsonb_key(writer, schema.field_name(S::Field::FILE_NAME));
124-
S::write_jsonb_string(writer, file_name);
125-
S::write_jsonb_key(writer, schema.field_name(S::Field::CONTENT_TYPE));
126-
S::write_jsonb_string(writer, content_type);
127-
S::write_jsonb_key(writer, schema.field_name(S::Field::SIZE));
128-
writer.writeInt64(-1);
129-
write_nullable_str(S::Field::REGION, "");
130-
write_nullable_str(S::Field::ENDPOINT, endpoint);
131-
write_nullable_str(S::Field::AK, ak);
132-
write_nullable_str(S::Field::SK, sk);
133-
// role_arn and external_id not used in to_file()
134-
S::write_jsonb_key(writer, schema.field_name(S::Field::ROLE_ARN));
135-
writer.writeNull();
136-
S::write_jsonb_key(writer, schema.field_name(S::Field::EXTERNAL_ID));
137-
writer.writeNull();
138-
writer.writeEndObject();
150+
// Returns `endpoint` with an explicit URL scheme, as needed by the S3 SDK.
//
//  - An endpoint that already starts with "http://" or "https://" is returned unchanged.
//  - An empty endpoint is returned unchanged (still empty) instead of being turned into
//    the bare string "http://", so a missing endpoint stays recognisable as missing.
//  - Any other value gets "http://" prepended.
//
// Prefix matching is case-sensitive.
static std::string _normalize_endpoint(const std::string& endpoint) {
    // compare() checks the prefix in place; substr() would allocate a temporary per check.
    const auto has_prefix = [&endpoint](const char* prefix, size_t len) {
        return endpoint.compare(0, len, prefix) == 0;
    };
    if (endpoint.empty() || has_prefix("http://", 7) || has_prefix("https://", 8)) {
        return endpoint;
    }
    return "http://" + endpoint;
}
157+
158+
// Rewrites the S3-compatible scheme aliases oss://, cos:// and obs:// to s3:// so the
// URI can be fed to the S3URI parser and stored in one canonical form.
// Any URI that does not start with one of these aliases is returned unchanged.
// Matching is case-sensitive and only inspects the leading scheme prefix.
static std::string _normalize_uri_scheme(const std::string& uri) {
    // One table instead of three copy-pasted branches; adding an alias is a one-line change.
    static constexpr std::string_view kAliasSchemes[] = {"oss://", "cos://", "obs://"};
    for (const std::string_view alias : kAliasSchemes) {
        // In-place prefix comparison; avoids allocating a substr() temporary per check.
        if (uri.compare(0, alias.size(), alias) == 0) {
            return "s3://" + uri.substr(alias.size());
        }
    }
    return uri;
}
};
141172

0 commit comments

Comments
 (0)