]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/port/win/io_win.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / port / win / io_win.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #if defined(OS_WIN)
11
12 #include "port/win/io_win.h"
13
14 #include "env_win.h"
15 #include "monitoring/iostats_context_imp.h"
16 #include "test_util/sync_point.h"
17 #include "util/aligned_buffer.h"
18 #include "util/coding.h"
19
20 namespace ROCKSDB_NAMESPACE {
21 namespace port {
22
23 /*
24 * DirectIOHelper
25 */
namespace {

// Sector size constant, in bytes.
// NOTE(review): not referenced in the visible part of this file; WinFileData
// takes its sector size from WinFileSystem::GetSectorSize() instead.
const size_t kSectorSize = 512;

// Returns true when `alignment` is a power of two.
// Zero is rejected explicitly: the bit trick below would otherwise accept it
// because 0 & (0 - 1) == 0.
inline bool IsPowerOfTwo(const size_t alignment) {
  return alignment != 0 && (alignment & (alignment - 1)) == 0;
}

// Returns true when the address `ptr` is a multiple of `alignment`.
// The mask test is only meaningful when `alignment` is a power of two.
inline bool IsAligned(size_t alignment, const void* ptr) {
  return (reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) == 0;
}
}  // namespace
38
39 std::string GetWindowsErrSz(DWORD err) {
40 std::string Err;
41 LPSTR lpMsgBuf = nullptr;
42 FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
43 FORMAT_MESSAGE_IGNORE_INSERTS,
44 NULL, err,
45 0, // Default language
46 reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
47
48 if (lpMsgBuf) {
49 Err = lpMsgBuf;
50 LocalFree(lpMsgBuf);
51 }
52 return Err;
53 }
54
// We preserve the original names of these interfaces to denote the original
// idea behind them.
// All reads and writes happen at a specified offset, and the pwrite interface
// does not change the position of the file pointer. Judging from the man page
// and errno, it executes lseek atomically to return the file position back to
// where it was. WriteFile() does not have this capability. Therefore, for both
// pread and pwrite the file pointer is advanced to the next position, which is
// fine for writes because they are (should be) sequential.
// Because all the reads/writes happen at the specified offset, the caller in
// theory should not rely on the current file offset.
68 IOStatus pwrite(const WinFileData* file_data, const Slice& data,
69 uint64_t offset, size_t& bytes_written) {
70 IOStatus s;
71 bytes_written = 0;
72
73 size_t num_bytes = data.size();
74 if (num_bytes > std::numeric_limits<DWORD>::max()) {
75 // May happen in 64-bit builds where size_t is 64-bits but
76 // long is still 32-bit, but that's the API here at the moment
77 return IOStatus::InvalidArgument(
78 "num_bytes is too large for a single write: " + file_data->GetName());
79 }
80
81 OVERLAPPED overlapped = {0};
82 ULARGE_INTEGER offsetUnion;
83 offsetUnion.QuadPart = offset;
84
85 overlapped.Offset = offsetUnion.LowPart;
86 overlapped.OffsetHigh = offsetUnion.HighPart;
87
88 DWORD bytesWritten = 0;
89
90 if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(),
91 static_cast<DWORD>(num_bytes), &bytesWritten,
92 &overlapped)) {
93 auto lastError = GetLastError();
94 s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
95 lastError);
96 } else {
97 bytes_written = bytesWritten;
98 }
99
100 return s;
101 }
102
103 // See comments for pwrite above
104 IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
105 uint64_t offset, size_t& bytes_read) {
106 IOStatus s;
107 bytes_read = 0;
108
109 if (num_bytes > std::numeric_limits<DWORD>::max()) {
110 return IOStatus::InvalidArgument(
111 "num_bytes is too large for a single read: " + file_data->GetName());
112 }
113
114 OVERLAPPED overlapped = {0};
115 ULARGE_INTEGER offsetUnion;
116 offsetUnion.QuadPart = offset;
117
118 overlapped.Offset = offsetUnion.LowPart;
119 overlapped.OffsetHigh = offsetUnion.HighPart;
120
121 DWORD bytesRead = 0;
122
123 if (FALSE == ReadFile(file_data->GetFileHandle(), src,
124 static_cast<DWORD>(num_bytes), &bytesRead,
125 &overlapped)) {
126 auto lastError = GetLastError();
127 // EOF is OK with zero bytes read
128 if (lastError != ERROR_HANDLE_EOF) {
129 s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
130 lastError);
131 }
132 } else {
133 bytes_read = bytesRead;
134 }
135
136 return s;
137 }
138
139 // SetFileInformationByHandle() is capable of fast pre-allocates.
140 // However, this does not change the file end position unless the file is
141 // truncated and the pre-allocated space is not considered filled with zeros.
142 IOStatus fallocate(const std::string& filename, HANDLE hFile,
143 uint64_t to_size) {
144 IOStatus status;
145
146 FILE_ALLOCATION_INFO alloc_info;
147 alloc_info.AllocationSize.QuadPart = to_size;
148
149 if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
150 sizeof(FILE_ALLOCATION_INFO))) {
151 auto lastError = GetLastError();
152 status = IOErrorFromWindowsError(
153 "Failed to pre-allocate space: " + filename, lastError);
154 }
155
156 return status;
157 }
158
159 IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) {
160 IOStatus status;
161
162 FILE_END_OF_FILE_INFO end_of_file;
163 end_of_file.EndOfFile.QuadPart = toSize;
164
165 if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
166 sizeof(FILE_END_OF_FILE_INFO))) {
167 auto lastError = GetLastError();
168 status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
169 lastError);
170 }
171
172 return status;
173 }
174
175 size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
176 size_t /*max_size*/) {
177 // Returning 0 is safe as it causes the table reader to generate a unique ID.
178 // This is suboptimal for performance as it prevents multiple table readers
179 // for the same file from sharing cached blocks. For example, if users have
180 // a low value for `max_open_files`, there can be many table readers opened
181 // for the same file.
182 //
183 // TODO: this is a temporarily solution as it is safe but not optimal for
184 // performance. For more details see discussion in
185 // https://github.com/facebook/rocksdb/pull/5844.
186 return 0;
187 }
188
189 WinFileData::WinFileData(const std::string& filename, HANDLE hFile,
190 bool direct_io)
191 : filename_(filename),
192 hFile_(hFile),
193 use_direct_io_(direct_io),
194 sector_size_(WinFileSystem::GetSectorSize(filename)) {}
195
196 bool WinFileData::IsSectorAligned(const size_t off) const {
197 return (off & (sector_size_ - 1)) == 0;
198 }
199
200 ////////////////////////////////////////////////////////////////////////////////////////////////////
201 // WinMmapReadableFile
202
203 WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
204 HANDLE hFile, HANDLE hMap,
205 const void* mapped_region,
206 size_t length)
207 : WinFileData(fileName, hFile, false /* use_direct_io */),
208 hMap_(hMap),
209 mapped_region_(mapped_region),
210 length_(length) {}
211
212 WinMmapReadableFile::~WinMmapReadableFile() {
213 BOOL ret __attribute__((__unused__));
214 ret = ::UnmapViewOfFile(mapped_region_);
215 assert(ret);
216
217 ret = ::CloseHandle(hMap_);
218 assert(ret);
219 }
220
221 IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n,
222 const IOOptions& /*options*/, Slice* result,
223 char* scratch,
224 IODebugContext* /*dbg*/) const {
225 IOStatus s;
226
227 if (offset > length_) {
228 *result = Slice();
229 return IOError(filename_, EINVAL);
230 } else if (offset + n > length_) {
231 n = length_ - static_cast<size_t>(offset);
232 }
233 *result = Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
234 return s;
235 }
236
237 IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
238 return IOStatus::OK();
239 }
240
241 size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
242 return GetUniqueIdFromFile(hFile_, id, max_size);
243 }
244
245 ///////////////////////////////////////////////////////////////////////////////
246 /// WinMmapFile
247
// Can only truncate or reserve to a sector-size-aligned size when
// used on files that are opened with unbuffered I/O
250 IOStatus WinMmapFile::TruncateFile(uint64_t toSize) {
251 return ftruncate(filename_, hFile_, toSize);
252 }
253
254 IOStatus WinMmapFile::UnmapCurrentRegion() {
255 IOStatus status;
256
257 if (mapped_begin_ != nullptr) {
258 if (!::UnmapViewOfFile(mapped_begin_)) {
259 status = IOErrorFromWindowsError(
260 "Failed to unmap file view: " + filename_, GetLastError());
261 }
262
263 // Move on to the next portion of the file
264 file_offset_ += view_size_;
265
266 // UnmapView automatically sends data to disk but not the metadata
267 // which is good and provides some equivalent of fdatasync() on Linux
268 // therefore, we donot need separate flag for metadata
269 mapped_begin_ = nullptr;
270 mapped_end_ = nullptr;
271 dst_ = nullptr;
272
273 last_sync_ = nullptr;
274 pending_sync_ = false;
275 }
276
277 return status;
278 }
279
280 IOStatus WinMmapFile::MapNewRegion(const IOOptions& options,
281 IODebugContext* dbg) {
282 IOStatus status;
283
284 assert(mapped_begin_ == nullptr);
285
286 size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
287
288 if (minDiskSize > reserved_size_) {
289 status = Allocate(file_offset_, view_size_, options, dbg);
290 if (!status.ok()) {
291 return status;
292 }
293 }
294
295 // Need to remap
296 if (hMap_ == NULL || reserved_size_ > mapping_size_) {
297 if (hMap_ != NULL) {
298 // Unmap the previous one
299 BOOL ret __attribute__((__unused__));
300 ret = ::CloseHandle(hMap_);
301 assert(ret);
302 hMap_ = NULL;
303 }
304
305 ULARGE_INTEGER mappingSize;
306 mappingSize.QuadPart = reserved_size_;
307
308 hMap_ = CreateFileMappingA(
309 hFile_,
310 NULL, // Security attributes
311 PAGE_READWRITE, // There is not a write only mode for mapping
312 mappingSize.HighPart, // Enable mapping the whole file but the actual
313 // amount mapped is determined by MapViewOfFile
314 mappingSize.LowPart,
315 NULL); // Mapping name
316
317 if (NULL == hMap_) {
318 return IOErrorFromWindowsError(
319 "WindowsMmapFile failed to create file mapping for: " + filename_,
320 GetLastError());
321 }
322
323 mapping_size_ = reserved_size_;
324 }
325
326 ULARGE_INTEGER offset;
327 offset.QuadPart = file_offset_;
328
329 // View must begin at the granularity aligned offset
330 mapped_begin_ = reinterpret_cast<char*>(
331 MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
332 view_size_, NULL));
333
334 if (!mapped_begin_) {
335 status = IOErrorFromWindowsError(
336 "WindowsMmapFile failed to map file view: " + filename_,
337 GetLastError());
338 } else {
339 mapped_end_ = mapped_begin_ + view_size_;
340 dst_ = mapped_begin_;
341 last_sync_ = mapped_begin_;
342 pending_sync_ = false;
343 }
344 return status;
345 }
346
347 IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
348 return fallocate(filename_, hFile_, spaceToReserve);
349 }
350
351 WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
352 size_t page_size, size_t allocation_granularity,
353 const FileOptions& options)
354 : WinFileData(fname, hFile, false),
355 FSWritableFile(options),
356 hMap_(NULL),
357 page_size_(page_size),
358 allocation_granularity_(allocation_granularity),
359 reserved_size_(0),
360 mapping_size_(0),
361 view_size_(0),
362 mapped_begin_(nullptr),
363 mapped_end_(nullptr),
364 dst_(nullptr),
365 last_sync_(nullptr),
366 file_offset_(0),
367 pending_sync_(false) {
368 // Allocation granularity must be obtained from GetSystemInfo() and must be
369 // a power of two.
370 assert(allocation_granularity > 0);
371 assert((allocation_granularity & (allocation_granularity - 1)) == 0);
372
373 assert(page_size > 0);
374 assert((page_size & (page_size - 1)) == 0);
375
376 // Only for memory mapped writes
377 assert(options.use_mmap_writes);
378
379 // View size must be both the multiple of allocation_granularity AND the
380 // page size and the granularity is usually a multiple of a page size.
381 const size_t viewSize =
382 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
383 view_size_ = Roundup(viewSize, allocation_granularity_);
384 }
385
386 WinMmapFile::~WinMmapFile() {
387 if (hFile_) {
388 this->Close(IOOptions(), nullptr);
389 }
390 }
391
392 IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options,
393 IODebugContext* dbg) {
394 const char* src = data.data();
395 size_t left = data.size();
396
397 while (left > 0) {
398 assert(mapped_begin_ <= dst_);
399 size_t avail = mapped_end_ - dst_;
400
401 if (avail == 0) {
402 IOStatus s = UnmapCurrentRegion();
403 if (s.ok()) {
404 s = MapNewRegion(options, dbg);
405 }
406
407 if (!s.ok()) {
408 return s;
409 }
410 } else {
411 size_t n = std::min(left, avail);
412 memcpy(dst_, src, n);
413 dst_ += n;
414 src += n;
415 left -= n;
416 pending_sync_ = true;
417 }
418 }
419
420 // Now make sure that the last partial page is padded with zeros if needed
421 size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
422 if (bytesToPad > 0) {
423 memset(dst_, 0, bytesToPad);
424 }
425
426 return IOStatus::OK();
427 }
428
429 // Means Close() will properly take care of truncate
430 // and it does not need any additional information
431 IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/,
432 IODebugContext* /*dbg*/) {
433 return IOStatus::OK();
434 }
435
436 IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) {
437 IOStatus s;
438
439 assert(NULL != hFile_);
440
441 // We truncate to the precise size so no
442 // uninitialized data at the end. SetEndOfFile
443 // which we use does not write zeros and it is good.
444 uint64_t targetSize = GetFileSize(options, dbg);
445
446 if (mapped_begin_ != nullptr) {
447 // Sync before unmapping to make sure everything
448 // is on disk and there is not a lazy writing
449 // so we are deterministic with the tests
450 Sync(options, dbg);
451 s = UnmapCurrentRegion();
452 }
453
454 if (NULL != hMap_) {
455 BOOL ret = ::CloseHandle(hMap_);
456 if (!ret && s.ok()) {
457 auto lastError = GetLastError();
458 s = IOErrorFromWindowsError(
459 "Failed to Close mapping for file: " + filename_, lastError);
460 }
461
462 hMap_ = NULL;
463 }
464
465 if (hFile_ != NULL) {
466 TruncateFile(targetSize);
467
468 BOOL ret = ::CloseHandle(hFile_);
469 hFile_ = NULL;
470
471 if (!ret && s.ok()) {
472 auto lastError = GetLastError();
473 s = IOErrorFromWindowsError(
474 "Failed to close file map handle: " + filename_, lastError);
475 }
476 }
477
478 return s;
479 }
480
481 IOStatus WinMmapFile::Flush(const IOOptions& /*options*/,
482 IODebugContext* /*dbg*/) {
483 return IOStatus::OK();
484 }
485
486 // Flush only data
487 IOStatus WinMmapFile::Sync(const IOOptions& /*options*/,
488 IODebugContext* /*dbg*/) {
489 IOStatus s;
490
491 // Some writes occurred since last sync
492 if (dst_ > last_sync_) {
493 assert(mapped_begin_);
494 assert(dst_);
495 assert(dst_ > mapped_begin_);
496 assert(dst_ < mapped_end_);
497
498 size_t page_begin =
499 TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
500 size_t page_end =
501 TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
502
503 // Flush only the amount of that is a multiple of pages
504 if (!::FlushViewOfFile(mapped_begin_ + page_begin,
505 (page_end - page_begin) + page_size_)) {
506 s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
507 GetLastError());
508 } else {
509 last_sync_ = dst_;
510 }
511 }
512
513 return s;
514 }
515
516 /**
517 * Flush data as well as metadata to stable storage.
518 */
519 IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
520 IOStatus s = Sync(options, dbg);
521
522 // Flush metadata
523 if (s.ok() && pending_sync_) {
524 if (!::FlushFileBuffers(hFile_)) {
525 s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
526 GetLastError());
527 }
528 pending_sync_ = false;
529 }
530
531 return s;
532 }
533
534 /**
535 * Get the size of valid data in the file. This will not match the
536 * size that is returned from the filesystem because we use mmap
537 * to extend file by map_size every time.
538 */
539 uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/,
540 IODebugContext* /*dbg*/) {
541 size_t used = dst_ - mapped_begin_;
542 return file_offset_ + used;
543 }
544
545 IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) {
546 return IOStatus::OK();
547 }
548
549 IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len,
550 const IOOptions& /*options*/,
551 IODebugContext* /*dbg*/) {
552 IOStatus status;
553 TEST_KILL_RANDOM("WinMmapFile::Allocate");
554
555 // Make sure that we reserve an aligned amount of space
556 // since the reservation block size is driven outside so we want
557 // to check if we are ok with reservation here
558 size_t spaceToReserve =
559 Roundup(static_cast<size_t>(offset + len), view_size_);
560 // Nothing to do
561 if (spaceToReserve <= reserved_size_) {
562 return status;
563 }
564
565 IOSTATS_TIMER_GUARD(allocate_nanos);
566 status = PreallocateInternal(spaceToReserve);
567 if (status.ok()) {
568 reserved_size_ = spaceToReserve;
569 }
570 return status;
571 }
572
573 size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
574 return GetUniqueIdFromFile(hFile_, id, max_size);
575 }
576
577 //////////////////////////////////////////////////////////////////////////////////
578 // WinSequentialFile
579
580 WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
581 const FileOptions& options)
582 : WinFileData(fname, f, options.use_direct_reads) {}
583
584 WinSequentialFile::~WinSequentialFile() {
585 assert(hFile_ != INVALID_HANDLE_VALUE);
586 }
587
588 IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
589 Slice* result, char* scratch,
590 IODebugContext* /*dbg*/) {
591 IOStatus s;
592 size_t r = 0;
593
594 assert(result != nullptr);
595 if (WinFileData::use_direct_io()) {
596 return IOStatus::NotSupported("Read() does not support direct_io");
597 }
598
599 // Windows ReadFile API accepts a DWORD.
600 // While it is possible to read in a loop if n is too big
601 // it is an unlikely case.
602 if (n > std::numeric_limits<DWORD>::max()) {
603 return IOStatus::InvalidArgument("n is too big for a single ReadFile: " +
604 filename_);
605 }
606
607 DWORD bytesToRead =
608 static_cast<DWORD>(n); // cast is safe due to the check above
609 DWORD bytesRead = 0;
610 BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
611 if (ret != FALSE) {
612 r = bytesRead;
613 } else {
614 auto lastError = GetLastError();
615 if (lastError != ERROR_HANDLE_EOF) {
616 s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError);
617 }
618 }
619
620 *result = Slice(scratch, r);
621 return s;
622 }
623
624 IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
625 uint64_t offset,
626 size_t& bytes_read) const {
627 return pread(this, src, numBytes, offset, bytes_read);
628 }
629
630 IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n,
631 const IOOptions& /*opts*/,
632 Slice* result, char* scratch,
633 IODebugContext* /*dbg*/) {
634 if (!WinFileData::use_direct_io()) {
635 return IOStatus::NotSupported("This function is only used for direct_io");
636 }
637
638 assert(IsSectorAligned(static_cast<size_t>(offset)));
639 assert(IsSectorAligned(static_cast<size_t>(n)));
640
641 size_t bytes_read = 0; // out param
642 IOStatus s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset,
643 bytes_read);
644 *result = Slice(scratch, bytes_read);
645 return s;
646 }
647
648 IOStatus WinSequentialFile::Skip(uint64_t n) {
649 // Can't handle more than signed max as SetFilePointerEx accepts a signed
650 // 64-bit integer. As such it is a highly unlikley case to have n so large.
651 if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
652 return IOStatus::InvalidArgument(
653 "n is too large for a single SetFilePointerEx() call" + filename_);
654 }
655
656 LARGE_INTEGER li;
657 li.QuadPart = static_cast<LONGLONG>(n); // cast is safe due to the check
658 // above
659 BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
660 if (ret == FALSE) {
661 auto lastError = GetLastError();
662 return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
663 lastError);
664 }
665 return IOStatus::OK();
666 }
667
668 IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
669 return IOStatus::OK();
670 }
671
672 //////////////////////////////////////////////////////////////////////////////////////////////////
/// WinRandomAccessImpl
674
675 inline IOStatus WinRandomAccessImpl::PositionedReadInternal(
676 char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const {
677 return pread(file_base_, src, numBytes, offset, bytes_read);
678 }
679
680 inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
681 size_t alignment,
682 const FileOptions& options)
683 : file_base_(file_base),
684 alignment_(std::max(alignment, file_base->GetSectorSize())) {
685 assert(!options.use_mmap_reads);
686 }
687
688 inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n,
689 Slice* result,
690 char* scratch) const {
691 // Check buffer alignment
692 if (file_base_->use_direct_io()) {
693 assert(file_base_->IsSectorAligned(static_cast<size_t>(offset)));
694 assert(IsAligned(alignment_, scratch));
695 }
696
697 if (n == 0) {
698 *result = Slice(scratch, 0);
699 return IOStatus::OK();
700 }
701
702 size_t bytes_read = 0;
703 IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read);
704 *result = Slice(scratch, bytes_read);
705 return s;
706 }
707
708 ///////////////////////////////////////////////////////////////////////////////////////////////////
709 /// WinRandomAccessFile
710
711 WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
712 size_t alignment,
713 const FileOptions& options)
714 : WinFileData(fname, hFile, options.use_direct_reads),
715 WinRandomAccessImpl(this, alignment, options) {}
716
717 WinRandomAccessFile::~WinRandomAccessFile() {}
718
719 IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n,
720 const IOOptions& /*options*/, Slice* result,
721 char* scratch,
722 IODebugContext* /*dbg*/) const {
723 return ReadImpl(offset, n, result, scratch);
724 }
725
726 IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
727 return IOStatus::OK();
728 }
729
730 size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
731 return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
732 }
733
734 size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
735 return GetAlignment();
736 }
737
738 /////////////////////////////////////////////////////////////////////////////
739 // WinWritableImpl
740 //
741
742 inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
743 return fallocate(file_data_->GetName(), file_data_->GetFileHandle(),
744 spaceToReserve);
745 }
746
747 inline WinWritableImpl::WinWritableImpl(WinFileData* file_data,
748 size_t alignment)
749 : file_data_(file_data),
750 alignment_(std::max(alignment, file_data->GetSectorSize())),
751 next_write_offset_(0),
752 reservedsize_(0) {
753 // Query current position in case ReopenWritableFile is called
754 // This position is only important for buffered writes
755 // for unbuffered writes we explicitely specify the position.
756 LARGE_INTEGER zero_move;
757 zero_move.QuadPart = 0; // Do not move
758 LARGE_INTEGER pos;
759 pos.QuadPart = 0;
760 BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
761 FILE_CURRENT);
762 // Querying no supped to fail
763 if (ret != 0) {
764 next_write_offset_ = pos.QuadPart;
765 } else {
766 assert(false);
767 }
768 }
769
770 inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) {
771 IOStatus s;
772
773 if (data.size() > std::numeric_limits<DWORD>::max()) {
774 return IOStatus::InvalidArgument("data is too long for a single write" +
775 file_data_->GetName());
776 }
777
778 size_t bytes_written = 0; // out param
779
780 if (file_data_->use_direct_io()) {
781 // With no offset specified we are appending
782 // to the end of the file
783 assert(file_data_->IsSectorAligned(next_write_offset_));
784 assert(file_data_->IsSectorAligned(data.size()));
785 assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
786 s = pwrite(file_data_, data, next_write_offset_, bytes_written);
787 } else {
788 DWORD bytesWritten = 0;
789 if (!WriteFile(file_data_->GetFileHandle(), data.data(),
790 static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
791 auto lastError = GetLastError();
792 s = IOErrorFromWindowsError(
793 "Failed to WriteFile: " + file_data_->GetName(), lastError);
794 } else {
795 bytes_written = bytesWritten;
796 }
797 }
798
799 if (s.ok()) {
800 if (bytes_written == data.size()) {
801 // This matters for direct_io cases where
802 // we rely on the fact that next_write_offset_
803 // is sector aligned
804 next_write_offset_ += bytes_written;
805 } else {
806 s = IOStatus::IOError("Failed to write all bytes: " +
807 file_data_->GetName());
808 }
809 }
810
811 return s;
812 }
813
814 inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data,
815 uint64_t offset) {
816 if (file_data_->use_direct_io()) {
817 assert(file_data_->IsSectorAligned(static_cast<size_t>(offset)));
818 assert(file_data_->IsSectorAligned(data.size()));
819 assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
820 }
821
822 size_t bytes_written = 0;
823 IOStatus s = pwrite(file_data_, data, offset, bytes_written);
824
825 if (s.ok()) {
826 if (bytes_written == data.size()) {
827 // For sequential write this would be simple
828 // size extension by data.size()
829 uint64_t write_end = offset + bytes_written;
830 if (write_end >= next_write_offset_) {
831 next_write_offset_ = write_end;
832 }
833 } else {
834 s = IOStatus::IOError("Failed to write all of the requested data: " +
835 file_data_->GetName());
836 }
837 }
838 return s;
839 }
840
841 inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) {
842 // It is tempting to check for the size for sector alignment
843 // but truncation may come at the end and there is not a requirement
844 // for this to be sector aligned so long as we do not attempt to write
845 // after that. The interface docs state that the behavior is undefined
846 // in that case.
847 IOStatus s =
848 ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size);
849
850 if (s.ok()) {
851 next_write_offset_ = size;
852 }
853 return s;
854 }
855
856 inline IOStatus WinWritableImpl::CloseImpl() {
857 IOStatus s;
858
859 auto hFile = file_data_->GetFileHandle();
860 assert(INVALID_HANDLE_VALUE != hFile);
861
862 if (!::FlushFileBuffers(hFile)) {
863 auto lastError = GetLastError();
864 s = IOErrorFromWindowsError(
865 "FlushFileBuffers failed at Close() for: " + file_data_->GetName(),
866 lastError);
867 }
868
869 if (!file_data_->CloseFile() && s.ok()) {
870 auto lastError = GetLastError();
871 s = IOErrorFromWindowsError(
872 "CloseHandle failed for: " + file_data_->GetName(), lastError);
873 }
874 return s;
875 }
876
877 inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/,
878 IODebugContext* /*dbg*/) {
879 IOStatus s;
880 if (!::FlushFileBuffers(file_data_->GetFileHandle())) {
881 auto lastError = GetLastError();
882 s = IOErrorFromWindowsError(
883 "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(),
884 lastError);
885 }
886 return s;
887 }
888
889 inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
890 IOStatus status;
891 TEST_KILL_RANDOM("WinWritableFile::Allocate");
892
893 // Make sure that we reserve an aligned amount of space
894 // since the reservation block size is driven outside so we want
895 // to check if we are ok with reservation here
896 size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len),
897 static_cast<size_t>(alignment_));
898 // Nothing to do
899 if (spaceToReserve <= reservedsize_) {
900 return status;
901 }
902
903 IOSTATS_TIMER_GUARD(allocate_nanos);
904 status = PreallocateInternal(spaceToReserve);
905 if (status.ok()) {
906 reservedsize_ = spaceToReserve;
907 }
908 return status;
909 }
910
911 ////////////////////////////////////////////////////////////////////////////////
912 /// WinWritableFile
913
914 WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
915 size_t alignment, size_t /* capacity */,
916 const FileOptions& options)
917 : WinFileData(fname, hFile, options.use_direct_writes),
918 WinWritableImpl(this, alignment),
919 FSWritableFile(options) {
920 assert(!options.use_mmap_writes);
921 }
922
923 WinWritableFile::~WinWritableFile() {}
924
925 // Indicates if the class makes use of direct I/O
926 bool WinWritableFile::use_direct_io() const {
927 return WinFileData::use_direct_io();
928 }
929
930 size_t WinWritableFile::GetRequiredBufferAlignment() const {
931 return static_cast<size_t>(GetAlignment());
932 }
933
934 IOStatus WinWritableFile::Append(const Slice& data,
935 const IOOptions& /*options*/,
936 IODebugContext* /*dbg*/) {
937 return AppendImpl(data);
938 }
939
940 IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
941 const IOOptions& /*options*/,
942 IODebugContext* /*dbg*/) {
943 return PositionedAppendImpl(data, offset);
944 }
945
946 // Need to implement this so the file is truncated correctly
947 // when buffered and unbuffered mode
948 IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/,
949 IODebugContext* /*dbg*/) {
950 return TruncateImpl(size);
951 }
952
953 IOStatus WinWritableFile::Close(const IOOptions& /*options*/,
954 IODebugContext* /*dbg*/) {
955 return CloseImpl();
956 }
957
// Write out the cached data to the OS cache.
// This is now taken care of by the WritableFileWriter
960 IOStatus WinWritableFile::Flush(const IOOptions& /*options*/,
961 IODebugContext* /*dbg*/) {
962 return IOStatus::OK();
963 }
964
965 IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) {
966 return SyncImpl(options, dbg);
967 }
968
969 IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
970 return SyncImpl(options, dbg);
971 }
972
973 bool WinWritableFile::IsSyncThreadSafe() const { return true; }
974
975 uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/,
976 IODebugContext* /*dbg*/) {
977 return GetFileNextWriteOffset();
978 }
979
980 IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len,
981 const IOOptions& /*options*/,
982 IODebugContext* /*dbg*/) {
983 return AllocateImpl(offset, len);
984 }
985
986 size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
987 return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
988 }
989
990 /////////////////////////////////////////////////////////////////////////
991 /// WinRandomRWFile
992
993 WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
994 size_t alignment, const FileOptions& options)
995 : WinFileData(fname, hFile,
996 options.use_direct_reads && options.use_direct_writes),
997 WinRandomAccessImpl(this, alignment, options),
998 WinWritableImpl(this, alignment) {}
999
1000 bool WinRandomRWFile::use_direct_io() const {
1001 return WinFileData::use_direct_io();
1002 }
1003
1004 size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
1005 assert(WinRandomAccessImpl::GetAlignment() ==
1006 WinWritableImpl::GetAlignment());
1007 return static_cast<size_t>(WinRandomAccessImpl::GetAlignment());
1008 }
1009
1010 IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data,
1011 const IOOptions& /*options*/,
1012 IODebugContext* /*dbg*/) {
1013 return PositionedAppendImpl(data, offset);
1014 }
1015
1016 IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n,
1017 const IOOptions& /*options*/, Slice* result,
1018 char* scratch, IODebugContext* /*dbg*/) const {
1019 return ReadImpl(offset, n, result, scratch);
1020 }
1021
1022 IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/,
1023 IODebugContext* /*dbg*/) {
1024 return IOStatus::OK();
1025 }
1026
1027 IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) {
1028 return SyncImpl(options, dbg);
1029 }
1030
1031 IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/,
1032 IODebugContext* /*dbg*/) {
1033 return CloseImpl();
1034 }
1035
1036 //////////////////////////////////////////////////////////////////////////
/// WinMemoryMappedBuffer
1038 WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
1039 BOOL ret
1040 #if defined(_MSC_VER)
1041 = FALSE;
1042 #else
1043 __attribute__((__unused__));
1044 #endif
1045 if (base_ != nullptr) {
1046 ret = ::UnmapViewOfFile(base_);
1047 assert(ret);
1048 base_ = nullptr;
1049 }
1050 if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
1051 ret = ::CloseHandle(map_handle_);
1052 assert(ret);
1053 map_handle_ = NULL;
1054 }
1055 if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
1056 ret = ::CloseHandle(file_handle_);
1057 assert(ret);
1058 file_handle_ = NULL;
1059 }
1060 }
1061
1062 //////////////////////////////////////////////////////////////////////////
1063 /// WinDirectory
1064
1065 IOStatus WinDirectory::Fsync(const IOOptions& /*options*/,
1066 IODebugContext* /*dbg*/) {
1067 return IOStatus::OK();
1068 }
1069
1070 IOStatus WinDirectory::Close(const IOOptions& /*options*/,
1071 IODebugContext* /*dbg*/) {
1072 IOStatus s = IOStatus::OK();
1073 BOOL ret __attribute__((__unused__));
1074 if (handle_ != INVALID_HANDLE_VALUE) {
1075 ret = ::CloseHandle(handle_);
1076 if (!ret) {
1077 auto lastError = GetLastError();
1078 s = IOErrorFromWindowsError("Directory closes failed for : " + GetName(),
1079 lastError);
1080 }
1081 handle_ = NULL;
1082 }
1083 return s;
1084 }
1085
1086 size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
1087 return GetUniqueIdFromFile(handle_, id, max_size);
1088 }
1089 //////////////////////////////////////////////////////////////////////////
1090 /// WinFileLock
1091
1092 WinFileLock::~WinFileLock() {
1093 BOOL ret __attribute__((__unused__));
1094 ret = ::CloseHandle(hFile_);
1095 assert(ret);
1096 }
1097
1098 } // namespace port
1099 } // namespace ROCKSDB_NAMESPACE
1100
1101 #endif