Skip to content

Commit cba58fc

Browse files
committed
some test case
1 parent ef3e211 commit cba58fc

13 files changed

Lines changed: 700 additions & 172 deletions

File tree

be/src/core/column/column_file.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class ColumnFile final : public COWHelper<IColumn, ColumnFile> {
4343

4444
Field operator[](size_t n) const override { return (*_data)[n]; }
4545
void get(size_t n, Field& res) const override { _data->get(n, res); }
46+
StringRef get_data_at(size_t n) const override { return _data->get_data_at(n); }
4647

4748
void insert(const Field& x) override;
4849
void insert_from(const IColumn& src, size_t n) override;

be/src/core/data_type/file_schema_descriptor.cpp

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -42,22 +42,16 @@ FileSchemaDescriptor::FileSchemaDescriptor() {
4242
});
4343
};
4444

45-
add_field(FILE_FIELD_URI.data(), std::make_shared<DataTypeString>(4096, TYPE_VARCHAR));
46-
add_field(FILE_FIELD_FILE_NAME.data(), std::make_shared<DataTypeString>(512, TYPE_VARCHAR));
47-
add_field(FILE_FIELD_CONTENT_TYPE.data(), std::make_shared<DataTypeString>(128, TYPE_VARCHAR));
48-
add_field(FILE_FIELD_SIZE.data(), std::make_shared<DataTypeInt64>());
49-
add_field(FILE_FIELD_REGION.data(),
50-
make_nullable(std::make_shared<DataTypeString>(64, TYPE_VARCHAR)));
51-
add_field(FILE_FIELD_ENDPOINT.data(),
52-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
53-
add_field(FILE_FIELD_AK.data(),
54-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
55-
add_field(FILE_FIELD_SK.data(),
56-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
57-
add_field(FILE_FIELD_ROLE_ARN.data(),
58-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
59-
add_field(FILE_FIELD_EXTERNAL_ID.data(),
60-
make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
45+
add_field("uri", std::make_shared<DataTypeString>(4096, TYPE_VARCHAR));
46+
add_field("file_name", std::make_shared<DataTypeString>(512, TYPE_VARCHAR));
47+
add_field("content_type", std::make_shared<DataTypeString>(128, TYPE_VARCHAR));
48+
add_field("size", std::make_shared<DataTypeInt64>());
49+
add_field("region", make_nullable(std::make_shared<DataTypeString>(64, TYPE_VARCHAR)));
50+
add_field("endpoint", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
51+
add_field("ak", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
52+
add_field("sk", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
53+
add_field("role_arn", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
54+
add_field("external_id", make_nullable(std::make_shared<DataTypeString>(256, TYPE_VARCHAR)));
6155
}
6256

6357
std::optional<size_t> FileSchemaDescriptor::try_get_position(std::string_view name) const {
@@ -144,4 +138,33 @@ void FileSchemaDescriptor::write_jsonb_string(JsonbWriter& writer, const std::st
144138
void FileSchemaDescriptor::write_jsonb_key(JsonbWriter& writer, std::string_view key) {
145139
writer.writeKey(key.data(), cast_set<uint8_t>(key.size()));
146140
}
141+
142+
void FileSchemaDescriptor::write_file_jsonb(JsonbWriter& writer, const FileMetadata& metadata) {
143+
const auto& schema = instance();
144+
auto write_nullable_str = [&](Field field, const std::string& s) {
145+
write_jsonb_key(writer, schema.field_name(field));
146+
if (s.empty()) {
147+
writer.writeNull();
148+
} else {
149+
write_jsonb_string(writer, s);
150+
}
151+
};
152+
153+
writer.writeStartObject();
154+
write_jsonb_key(writer, schema.field_name(Field::URI));
155+
write_jsonb_string(writer, metadata.uri);
156+
write_jsonb_key(writer, schema.field_name(Field::FILE_NAME));
157+
write_jsonb_string(writer, metadata.file_name);
158+
write_jsonb_key(writer, schema.field_name(Field::CONTENT_TYPE));
159+
write_jsonb_string(writer, metadata.content_type);
160+
write_jsonb_key(writer, schema.field_name(Field::SIZE));
161+
writer.writeInt64(metadata.size);
162+
write_nullable_str(Field::REGION, metadata.region);
163+
write_nullable_str(Field::ENDPOINT, metadata.endpoint);
164+
write_nullable_str(Field::AK, metadata.ak);
165+
write_nullable_str(Field::SK, metadata.sk);
166+
write_nullable_str(Field::ROLE_ARN, metadata.role_arn);
167+
write_nullable_str(Field::EXTERNAL_ID, metadata.external_id);
168+
writer.writeEndObject();
169+
}
147170
} // namespace doris

be/src/core/data_type/file_schema_descriptor.h

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,18 @@ struct FileFieldDesc {
3131
DataTypePtr type;
3232
};
3333

34-
inline constexpr std::string_view FILE_FIELD_URI = "uri";
35-
inline constexpr std::string_view FILE_FIELD_FILE_NAME = "file_name";
36-
inline constexpr std::string_view FILE_FIELD_CONTENT_TYPE = "content_type";
37-
inline constexpr std::string_view FILE_FIELD_SIZE = "size";
38-
inline constexpr std::string_view FILE_FIELD_REGION = "region";
39-
inline constexpr std::string_view FILE_FIELD_ENDPOINT = "endpoint";
40-
inline constexpr std::string_view FILE_FIELD_AK = "ak";
41-
inline constexpr std::string_view FILE_FIELD_SK = "sk";
42-
inline constexpr std::string_view FILE_FIELD_ROLE_ARN = "role_arn";
43-
inline constexpr std::string_view FILE_FIELD_EXTERNAL_ID = "external_id";
34+
struct FileMetadata {
35+
std::string uri;
36+
std::string file_name;
37+
std::string content_type;
38+
int64_t size = 0;
39+
std::string region;
40+
std::string endpoint;
41+
std::string ak;
42+
std::string sk;
43+
std::string role_arn;
44+
std::string external_id;
45+
};
4446

4547
// now struct FileInfo only contains file_name and file_size,
4648
// and if we want to get ETAG and LAST_MODIFIED_AT, need refactor FileInfo and the underlying file system client to support them.
@@ -75,6 +77,8 @@ class FileSchemaDescriptor final {
7577
static std::string extension_to_content_type(const std::string& ext);
7678
static void write_jsonb_string(JsonbWriter& writer, const std::string& value);
7779
static void write_jsonb_key(JsonbWriter& writer, std::string_view key);
80+
// Serializes a FileMetadata into a complete JSONB object.
81+
static void write_file_jsonb(JsonbWriter& writer, const FileMetadata& metadata);
7882

7983
private:
8084
FileSchemaDescriptor();

be/src/exprs/function/function_file.cpp

Lines changed: 94 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,10 @@
1818
#include <cstring>
1919
#include <memory>
2020
#include <optional>
21+
#include <sstream>
2122
#include <string>
2223
#include <string_view>
24+
#include <vector>
2325

2426
#include "common/cast_set.h"
2527
#include "common/status.h"
@@ -34,7 +36,10 @@
3436
#include "core/data_type/file_schema_descriptor.h"
3537
#include "exprs/function/function.h"
3638
#include "exprs/function/simple_function_factory.h"
39+
#include "io/fs/obj_storage_client.h"
3740
#include "util/jsonb_writer.h"
41+
#include "util/s3_uri.h"
42+
#include "util/s3_util.h"
3843

3944
namespace doris {
4045

@@ -84,8 +89,47 @@ class FunctionToFile : public IFunction {
8489
std::string content_type =
8590
S::extension_to_content_type(S::extract_file_extension(file_name));
8691

92+
// Ensure endpoint has http:// prefix for S3 SDK.
93+
std::string normalized_endpoint = _normalize_endpoint(endpoint);
94+
std::string region = _infer_region(normalized_endpoint);
95+
96+
// Validate the object exists via HEAD request and get actual size.
97+
S3ClientConf s3_conf;
98+
s3_conf.endpoint = normalized_endpoint;
99+
s3_conf.region = region;
100+
s3_conf.ak = ak;
101+
s3_conf.sk = sk;
102+
auto s3_client = S3ClientFactory::instance().create(s3_conf);
103+
if (!s3_client) {
104+
return Status::InternalError(
105+
"to_file: failed to create S3 client for endpoint '{}'", endpoint);
106+
}
107+
// Normalize oss:// etc. to s3:// for S3URI parser and storage.
108+
std::string normalized_uri = _normalize_uri_scheme(uri);
109+
S3URI s3_uri(normalized_uri);
110+
RETURN_IF_ERROR(s3_uri.parse());
111+
auto head_resp = s3_client->head_object(
112+
{.bucket = s3_uri.get_bucket(), .key = s3_uri.get_key()});
113+
if (head_resp.resp.status.code != 0) {
114+
return Status::InvalidArgument("to_file: object '{}' is not accessible: {}", uri,
115+
head_resp.resp.status.msg);
116+
}
117+
int64_t file_size = head_resp.file_size;
118+
87119
writer.reset();
88-
_write_file_jsonb(writer, schema, uri, file_name, content_type, endpoint, ak, sk);
120+
FileMetadata metadata {
121+
.uri = normalized_uri,
122+
.file_name = file_name,
123+
.content_type = content_type,
124+
.size = file_size,
125+
.region = region,
126+
.endpoint = normalized_endpoint,
127+
.ak = ak,
128+
.sk = sk,
129+
.role_arn = {},
130+
.external_id = {},
131+
};
132+
S::write_file_jsonb(writer, metadata);
89133
jsonb_col.insert_data(writer.getOutput()->getBuffer(), writer.getOutput()->getSize());
90134
}
91135
block.replace_by_position(result, std::move(result_col));
@@ -102,40 +146,55 @@ class FunctionToFile : public IFunction {
102146
return &assert_cast<const ColumnString&>(*holder);
103147
}
104148

105-
static void _write_file_jsonb(JsonbWriter& writer, const FileSchemaDescriptor& schema,
106-
const std::string& uri, const std::string& file_name,
107-
const std::string& content_type,
108-
const std::string& endpoint,
109-
const std::string& ak, const std::string& sk) {
110-
using S = FileSchemaDescriptor;
111-
auto write_nullable_str = [&](S::Field field, const std::string& s) {
112-
S::write_jsonb_key(writer, schema.field_name(field));
113-
if (s.empty()) {
114-
writer.writeNull();
115-
} else {
116-
S::write_jsonb_string(writer, s);
117-
}
118-
};
119-
120-
writer.writeStartObject();
121-
S::write_jsonb_key(writer, schema.field_name(S::Field::URI));
122-
S::write_jsonb_string(writer, uri);
123-
S::write_jsonb_key(writer, schema.field_name(S::Field::FILE_NAME));
124-
S::write_jsonb_string(writer, file_name);
125-
S::write_jsonb_key(writer, schema.field_name(S::Field::CONTENT_TYPE));
126-
S::write_jsonb_string(writer, content_type);
127-
S::write_jsonb_key(writer, schema.field_name(S::Field::SIZE));
128-
writer.writeInt64(-1);
129-
write_nullable_str(S::Field::REGION, "");
130-
write_nullable_str(S::Field::ENDPOINT, endpoint);
131-
write_nullable_str(S::Field::AK, ak);
132-
write_nullable_str(S::Field::SK, sk);
133-
// role_arn and external_id not used in to_file()
134-
S::write_jsonb_key(writer, schema.field_name(S::Field::ROLE_ARN));
135-
writer.writeNull();
136-
S::write_jsonb_key(writer, schema.field_name(S::Field::EXTERNAL_ID));
137-
writer.writeNull();
138-
writer.writeEndObject();
149+
// Ensure endpoint has http:// scheme prefix.
150+
static std::string _normalize_endpoint(const std::string& endpoint) {
151+
if (endpoint.substr(0, 7) == "http://" || endpoint.substr(0, 8) == "https://") {
152+
return endpoint;
153+
}
154+
return "http://" + endpoint;
155+
}
156+
157+
// Normalize oss:// or other S3-compatible schemes to s3:// for S3URI parser.
158+
static std::string _normalize_uri_scheme(const std::string& uri) {
159+
if (uri.substr(0, 6) == "oss://") {
160+
return "s3://" + uri.substr(6);
161+
}
162+
if (uri.substr(0, 6) == "cos://") {
163+
return "s3://" + uri.substr(6);
164+
}
165+
if (uri.substr(0, 6) == "obs://") {
166+
return "s3://" + uri.substr(6);
167+
}
168+
return uri;
169+
}
170+
171+
// Infer region from endpoint.
172+
// oss-cn-shanghai.aliyuncs.com → cn-shanghai
173+
// s3.us-east-1.amazonaws.com → us-east-1
174+
static std::string _infer_region(const std::string& endpoint) {
175+
// Strip scheme
176+
std::string host = endpoint;
177+
if (host.substr(0, 8) == "https://") {
178+
host = host.substr(8);
179+
} else if (host.substr(0, 7) == "http://") {
180+
host = host.substr(7);
181+
}
182+
// Split by '.'
183+
std::vector<std::string> parts;
184+
std::istringstream iss(host);
185+
std::string part;
186+
while (std::getline(iss, part, '.')) {
187+
parts.push_back(part);
188+
}
189+
if (parts.size() < 2) {
190+
return "us-east-1";
191+
}
192+
// OSS: oss-cn-shanghai.aliyuncs.com → first part starts with "oss-"
193+
if (parts[0].find("oss-") == 0) {
194+
return parts[0].substr(4); // strip "oss-" prefix, e.g. "cn-shanghai"
195+
}
196+
// AWS S3: s3.us-east-1.amazonaws.com → second part
197+
return parts[1];
139198
}
140199
};
141200

0 commit comments

Comments
 (0)