-
Notifications
You must be signed in to change notification settings - Fork 13
Add write buffering to reduce IOPS from small sequential writes #65
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 7 commits
ead5ab4
026f026
7fec20a
439bbf0
9600623
a665e72
421bbee
b830dd6
c35c4d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,3 +30,10 @@ | |
| *.exe | ||
| *.out | ||
| *.app | ||
|
|
||
| # Build directories | ||
| build/ | ||
| CMakeFiles/ | ||
| CMakeCache.txt | ||
| cmake_install.cmake | ||
| Makefile | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -133,7 +133,7 @@ class ErrorSentry | |
| }; | ||
|
|
||
|
|
||
| MultiuserFile::MultiuserFile(const char *user, std::unique_ptr<XrdOssDF> ossDF, XrdSysError &log, mode_t umask_mode, bool checksum_on_write, unsigned digests, MultiuserFileSystem *oss) : | ||
| MultiuserFile::MultiuserFile(const char *user, std::unique_ptr<XrdOssDF> ossDF, XrdSysError &log, mode_t umask_mode, bool checksum_on_write, unsigned digests, MultiuserFileSystem *oss, size_t write_buffer_size) : | ||
| XrdOssDF(user), | ||
| m_wrapped(std::move(ossDF)), | ||
| m_log(log), | ||
|
|
@@ -142,7 +142,10 @@ MultiuserFile::MultiuserFile(const char *user, std::unique_ptr<XrdOssDF> ossDF, | |
| m_nextoff(0), | ||
| m_oss(oss), | ||
| m_checksum_on_write(checksum_on_write), | ||
| m_digests(digests) | ||
| m_digests(digests), | ||
| m_write_buffer_size(write_buffer_size), | ||
| m_buffer_offset(-1), | ||
| m_buffering_enabled(write_buffer_size > 0) | ||
| {} | ||
|
|
||
| int MultiuserFile::Open(const char *path, int Oflag, mode_t Mode, XrdOucEnv &env) | ||
|
|
@@ -173,28 +176,143 @@ int MultiuserFile::Open(const char *path, int Oflag, mode_t Mode, XrdOucEnv | |
|
|
||
| ssize_t MultiuserFile::Write(const void *buffer, off_t offset, size_t size) | ||
| { | ||
| // Lock protects buffer state and sequential write tracking. | ||
| // While this serializes writes to the same file, it ensures correctness | ||
| // and is consistent with the sequential write assumption. | ||
| std::lock_guard<std::mutex> lock(m_buffer_mutex); | ||
|
|
||
| if ((offset != m_nextoff) && m_state) | ||
| // Check for out-of-order writes if checksumming or buffering | ||
| if ((offset != m_nextoff) && (m_state || m_buffering_enabled)) | ||
| { | ||
| std::stringstream ss; | ||
| ss << "Out-of-order writes not supported while running checksum. " << m_fname; | ||
| m_log.Emsg("Write", ss.str().c_str()); | ||
| return -ENOTSUP; | ||
| // Flush any buffered data first | ||
| if (m_buffering_enabled) { | ||
| int flush_result = FlushWriteBuffer(); | ||
| if (flush_result < 0) { | ||
| return flush_result; | ||
| } | ||
| // Disable buffering for the rest of this file | ||
| m_buffering_enabled = false; | ||
| } | ||
|
|
||
| if (m_state) { | ||
| std::stringstream ss; | ||
| ss << "Out-of-order writes not supported while running checksum. " << m_fname; | ||
| m_log.Emsg("Write", ss.str().c_str()); | ||
| return -ENOTSUP; | ||
| } | ||
| } | ||
|
|
||
| // If buffering is enabled and configured | ||
| if (m_buffering_enabled) { | ||
| // If this is the first write or buffer is empty, initialize buffer offset | ||
| if (m_write_buffer.empty()) { | ||
| m_buffer_offset = offset; | ||
| } | ||
|
|
||
| // Check if this write is sequential to the buffer | ||
| off_t expected_offset = m_buffer_offset + static_cast<off_t>(m_write_buffer.size()); | ||
| if (offset != expected_offset) { | ||
| // Not sequential - flush buffer and disable buffering | ||
| int flush_result = FlushWriteBuffer(); | ||
| if (flush_result < 0) { | ||
| return flush_result; | ||
| } | ||
| m_buffering_enabled = false; | ||
| // Fall through to direct write | ||
| } else { | ||
| // Sequential write - check if we should buffer it | ||
| size_t total_size = m_write_buffer.size() + size; | ||
|
|
||
| if (total_size <= m_write_buffer_size) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are edge cases here I'd like approached differently. If the total is over the buffer size, then copy as much data as you can to fill the buffer and then write. This way, writes are always the buffer size (except maybe the last before close) at the cost of memcpy.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated to fill the buffer completely before flushing when total exceeds buffer size. After flush, remaining data is either buffered (if it fits) or written directly (if too large). All writes are now buffer-sized except possibly the last. Fixed in b830dd6. |
||
| // Buffer this write - reserve capacity to avoid reallocations | ||
| if (m_write_buffer.capacity() < total_size) { | ||
| m_write_buffer.reserve(total_size); | ||
| } | ||
| m_write_buffer.insert(m_write_buffer.end(), | ||
| static_cast<const unsigned char*>(buffer), | ||
| static_cast<const unsigned char*>(buffer) + size); | ||
| m_nextoff = offset + size; | ||
| return size; | ||
| } else { | ||
| // Buffer would exceed limit - flush and write directly | ||
| int flush_result = FlushWriteBuffer(); | ||
| if (flush_result < 0) { | ||
| return flush_result; | ||
| } | ||
| // Fall through to direct write | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Direct write (no buffering or buffer disabled) | ||
| auto result = m_wrapped->Write(buffer, offset, size); | ||
| if (result >= 0) {m_nextoff += result;} | ||
| if (m_state) | ||
| { | ||
| m_state->Update(static_cast<const unsigned char*>(buffer), size); | ||
| if (result >= 0) { | ||
| m_nextoff = offset + result; | ||
| if (m_state && result > 0) { | ||
| // Only update checksum for the data that was actually written | ||
| m_state->Update(static_cast<const unsigned char*>(buffer), result); | ||
| } | ||
| } | ||
| return result; | ||
| } | ||
|
|
||
|
|
||
|
|
||
| // FlushWriteBuffer: Writes all buffered data to the underlying file system. | ||
| // Preconditions: Must be called with m_buffer_mutex held. | ||
| // Returns: 0 on success, negative error code on failure. | ||
| // Side effects: | ||
| // - Writes buffer contents via m_wrapped->Write() | ||
| // - Updates checksum state if enabled (m_state) | ||
| // - Clears m_write_buffer and resets m_buffer_offset on success | ||
| // - Handles partial writes with retry loop | ||
| // - On failure, buffer is NOT cleared to allow retry | ||
| int MultiuserFile::FlushWriteBuffer() | ||
| { | ||
| if (m_write_buffer.empty()) { | ||
| return 0; | ||
| } | ||
|
|
||
| size_t total_written = 0; | ||
| while (total_written < m_write_buffer.size()) { | ||
| auto result = m_wrapped->Write(m_write_buffer.data() + total_written, | ||
| m_buffer_offset + total_written, | ||
| m_write_buffer.size() - total_written); | ||
| if (result < 0) { | ||
| // Write failed - don't clear buffer, return error | ||
| return result; | ||
| } | ||
| if (result == 0) { | ||
| // No progress - treat as error | ||
| return -EIO; | ||
| } | ||
| total_written += result; | ||
| } | ||
|
|
||
| if (m_state) { | ||
| m_state->Update(m_write_buffer.data(), m_write_buffer.size()); | ||
| } | ||
|
|
||
| m_write_buffer.clear(); | ||
| m_buffer_offset = -1; | ||
|
|
||
| return 0; | ||
| } | ||
|
|
||
|
|
||
| int MultiuserFile::Close(long long *retsz) | ||
| { | ||
| std::lock_guard<std::mutex> lock(m_buffer_mutex); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similarly as above - if the buffering is disabled in the config file, then skip the lock.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Changed to use `std::unique_lock` with `std::defer_lock` - the lock is only acquired when `m_write_buffer_size > 0`. Fixed in b830dd6. |
||
|
|
||
| // Flush any remaining buffered data | ||
| if (!m_write_buffer.empty()) { | ||
| int flush_result = FlushWriteBuffer(); | ||
| if (flush_result < 0) { | ||
| m_log.Emsg("Close", "Failed to flush write buffer"); | ||
| // Continue with close anyway | ||
| } | ||
| } | ||
|
|
||
| auto close_result = m_wrapped->Close(retsz); | ||
| if (m_state) | ||
| { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When buffered writes are not enabled at constructor time (i.e., disabled in the config file), never take this mutex.
Merge the logic in https://github.com/opensciencegrid/xrootd-multiuser/pull/61/changes with the changes in this function; this work will conflict with that and it's straightforward to do both.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changed to use
`std::unique_lock` with `std::defer_lock` - the lock is only acquired when `m_write_buffer_size > 0`. Also merged the PR #61 logic to disable checksumming (instead of returning error) on non-sequential writes. Fixed in b830dd6.