]> git.proxmox.com Git - ceph.git/blob - ceph/src/rocksdb/port/win/io_win.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / rocksdb / port / win / io_win.cc
1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under the BSD-style license found in the
3 // LICENSE file in the root directory of this source tree. An additional grant
4 // of patent rights can be found in the PATENTS file in the same directory.
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "port/win/io_win.h"
11
12 #include "monitoring/iostats_context_imp.h"
13 #include "util/aligned_buffer.h"
14 #include "util/coding.h"
15 #include "util/sync_point.h"
16
17 namespace rocksdb {
18 namespace port {
19
20 /*
21 * DirectIOHelper
22 */
namespace {

// Sector size used by the alignment checks for direct (unbuffered) I/O.
const size_t kSectorSize = 512;

// Returns true when `alignment` is a non-zero power of two.
// Zero is explicitly rejected: the bit trick alone would accept it.
inline bool IsPowerOfTwo(const size_t alignment) {
  return alignment != 0 && (alignment & (alignment - 1)) == 0;
}

// Returns true when `off` is a multiple of kSectorSize.
inline bool IsSectorAligned(const size_t off) {
  return (off & (kSectorSize - 1)) == 0;
}

// Returns true when the address held by `ptr` is a multiple of `alignment`.
// `alignment` is expected to be a power of two (the check is a bit mask).
inline bool IsAligned(size_t alignment, const void* ptr) {
  return ((uintptr_t(ptr)) & (alignment - 1)) == 0;
}

}  // namespace
42
43
44 std::string GetWindowsErrSz(DWORD err) {
45 LPSTR lpMsgBuf;
46 FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
47 FORMAT_MESSAGE_IGNORE_INSERTS,
48 NULL, err,
49 0, // Default language
50 reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
51
52 std::string Err = lpMsgBuf;
53 LocalFree(lpMsgBuf);
54 return Err;
55 }
56
57 // We preserve the original name of this interface to denote the original idea
58 // behind it.
59 // All reads happen by a specified offset and pwrite interface does not change
60 // the position of the file pointer. Judging from the man page and errno it does
61 // execute
62 // lseek atomically to return the position of the file back where it was.
63 // WriteFile() does not
64 // have this capability. Therefore, for both pread and pwrite the pointer is
65 // advanced to the next position
66 // which is fine for writes because they are (should be) sequential.
67 // Because all the reads/writes happen by the specified offset, the caller in
68 // theory should not
69 // rely on the current file offset.
70 SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes,
71 uint64_t offset) {
72 assert(numBytes <= std::numeric_limits<DWORD>::max());
73 OVERLAPPED overlapped = { 0 };
74 ULARGE_INTEGER offsetUnion;
75 offsetUnion.QuadPart = offset;
76
77 overlapped.Offset = offsetUnion.LowPart;
78 overlapped.OffsetHigh = offsetUnion.HighPart;
79
80 SSIZE_T result = 0;
81
82 unsigned long bytesWritten = 0;
83
84 if (FALSE == WriteFile(hFile, src, static_cast<DWORD>(numBytes), &bytesWritten,
85 &overlapped)) {
86 result = -1;
87 } else {
88 result = bytesWritten;
89 }
90
91 return result;
92 }
93
94 // See comments for pwrite above
95 SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset) {
96 assert(numBytes <= std::numeric_limits<DWORD>::max());
97 OVERLAPPED overlapped = { 0 };
98 ULARGE_INTEGER offsetUnion;
99 offsetUnion.QuadPart = offset;
100
101 overlapped.Offset = offsetUnion.LowPart;
102 overlapped.OffsetHigh = offsetUnion.HighPart;
103
104 SSIZE_T result = 0;
105
106 unsigned long bytesRead = 0;
107
108 if (FALSE == ReadFile(hFile, src, static_cast<DWORD>(numBytes), &bytesRead,
109 &overlapped)) {
110 return -1;
111 } else {
112 result = bytesRead;
113 }
114
115 return result;
116 }
117
118 // SetFileInformationByHandle() is capable of fast pre-allocates.
119 // However, this does not change the file end position unless the file is
120 // truncated and the pre-allocated space is not considered filled with zeros.
121 Status fallocate(const std::string& filename, HANDLE hFile,
122 uint64_t to_size) {
123 Status status;
124
125 FILE_ALLOCATION_INFO alloc_info;
126 alloc_info.AllocationSize.QuadPart = to_size;
127
128 if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
129 sizeof(FILE_ALLOCATION_INFO))) {
130 auto lastError = GetLastError();
131 status = IOErrorFromWindowsError(
132 "Failed to pre-allocate space: " + filename, lastError);
133 }
134
135 return status;
136 }
137
138 Status ftruncate(const std::string& filename, HANDLE hFile,
139 uint64_t toSize) {
140 Status status;
141
142 FILE_END_OF_FILE_INFO end_of_file;
143 end_of_file.EndOfFile.QuadPart = toSize;
144
145 if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
146 sizeof(FILE_END_OF_FILE_INFO))) {
147 auto lastError = GetLastError();
148 status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
149 lastError);
150 }
151
152 return status;
153 }
154
155 size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) {
156
157 if (max_size < kMaxVarint64Length * 3) {
158 return 0;
159 }
160
161 // This function has to be re-worked for cases when
162 // ReFS file system introduced on Windows Server 2012 is used
163 BY_HANDLE_FILE_INFORMATION FileInfo;
164
165 BOOL result = GetFileInformationByHandle(hFile, &FileInfo);
166
167 TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
168
169 if (!result) {
170 return 0;
171 }
172
173 char* rid = id;
174 rid = EncodeVarint64(rid, uint64_t(FileInfo.dwVolumeSerialNumber));
175 rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexHigh));
176 rid = EncodeVarint64(rid, uint64_t(FileInfo.nFileIndexLow));
177
178 assert(rid >= id);
179 return static_cast<size_t>(rid - id);
180 }
181
182 ////////////////////////////////////////////////////////////////////////////////////////////////////
183 // WinMmapReadableFile
184
185 WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
186 HANDLE hFile, HANDLE hMap,
187 const void* mapped_region,
188 size_t length)
189 : WinFileData(fileName, hFile, false /* use_direct_io */),
190 hMap_(hMap),
191 mapped_region_(mapped_region),
192 length_(length) {}
193
194 WinMmapReadableFile::~WinMmapReadableFile() {
195 BOOL ret = ::UnmapViewOfFile(mapped_region_);
196 assert(ret);
197
198 ret = ::CloseHandle(hMap_);
199 assert(ret);
200 }
201
202 Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
203 char* scratch) const {
204 Status s;
205
206 if (offset > length_) {
207 *result = Slice();
208 return IOError(filename_, EINVAL);
209 } else if (offset + n > length_) {
210 n = length_ - offset;
211 }
212 *result =
213 Slice(reinterpret_cast<const char*>(mapped_region_)+offset, n);
214 return s;
215 }
216
217 Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
218 return Status::OK();
219 }
220
221 size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
222 return GetUniqueIdFromFile(hFile_, id, max_size);
223 }
224
225 ///////////////////////////////////////////////////////////////////////////////
226 /// WinMmapFile
227
228
229 // Can only truncate or reserve to a sector size aligned if
230 // used on files that are opened with Unbuffered I/O
231 Status WinMmapFile::TruncateFile(uint64_t toSize) {
232 return ftruncate(filename_, hFile_, toSize);
233 }
234
235 Status WinMmapFile::UnmapCurrentRegion() {
236 Status status;
237
238 if (mapped_begin_ != nullptr) {
239 if (!::UnmapViewOfFile(mapped_begin_)) {
240 status = IOErrorFromWindowsError(
241 "Failed to unmap file view: " + filename_, GetLastError());
242 }
243
244 // Move on to the next portion of the file
245 file_offset_ += view_size_;
246
247 // UnmapView automatically sends data to disk but not the metadata
248 // which is good and provides some equivalent of fdatasync() on Linux
249 // therefore, we donot need separate flag for metadata
250 mapped_begin_ = nullptr;
251 mapped_end_ = nullptr;
252 dst_ = nullptr;
253
254 last_sync_ = nullptr;
255 pending_sync_ = false;
256 }
257
258 return status;
259 }
260
261 Status WinMmapFile::MapNewRegion() {
262
263 Status status;
264
265 assert(mapped_begin_ == nullptr);
266
267 size_t minDiskSize = file_offset_ + view_size_;
268
269 if (minDiskSize > reserved_size_) {
270 status = Allocate(file_offset_, view_size_);
271 if (!status.ok()) {
272 return status;
273 }
274 }
275
276 // Need to remap
277 if (hMap_ == NULL || reserved_size_ > mapping_size_) {
278
279 if (hMap_ != NULL) {
280 // Unmap the previous one
281 BOOL ret = ::CloseHandle(hMap_);
282 assert(ret);
283 hMap_ = NULL;
284 }
285
286 ULARGE_INTEGER mappingSize;
287 mappingSize.QuadPart = reserved_size_;
288
289 hMap_ = CreateFileMappingA(
290 hFile_,
291 NULL, // Security attributes
292 PAGE_READWRITE, // There is not a write only mode for mapping
293 mappingSize.HighPart, // Enable mapping the whole file but the actual
294 // amount mapped is determined by MapViewOfFile
295 mappingSize.LowPart,
296 NULL); // Mapping name
297
298 if (NULL == hMap_) {
299 return IOErrorFromWindowsError(
300 "WindowsMmapFile failed to create file mapping for: " + filename_,
301 GetLastError());
302 }
303
304 mapping_size_ = reserved_size_;
305 }
306
307 ULARGE_INTEGER offset;
308 offset.QuadPart = file_offset_;
309
310 // View must begin at the granularity aligned offset
311 mapped_begin_ = reinterpret_cast<char*>(
312 MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
313 view_size_, NULL));
314
315 if (!mapped_begin_) {
316 status = IOErrorFromWindowsError(
317 "WindowsMmapFile failed to map file view: " + filename_,
318 GetLastError());
319 } else {
320 mapped_end_ = mapped_begin_ + view_size_;
321 dst_ = mapped_begin_;
322 last_sync_ = mapped_begin_;
323 pending_sync_ = false;
324 }
325 return status;
326 }
327
328 Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
329 return fallocate(filename_, hFile_, spaceToReserve);
330 }
331
332 WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
333 size_t allocation_granularity, const EnvOptions& options)
334 : WinFileData(fname, hFile, false),
335 hMap_(NULL),
336 page_size_(page_size),
337 allocation_granularity_(allocation_granularity),
338 reserved_size_(0),
339 mapping_size_(0),
340 view_size_(0),
341 mapped_begin_(nullptr),
342 mapped_end_(nullptr),
343 dst_(nullptr),
344 last_sync_(nullptr),
345 file_offset_(0),
346 pending_sync_(false) {
347 // Allocation granularity must be obtained from GetSystemInfo() and must be
348 // a power of two.
349 assert(allocation_granularity > 0);
350 assert((allocation_granularity & (allocation_granularity - 1)) == 0);
351
352 assert(page_size > 0);
353 assert((page_size & (page_size - 1)) == 0);
354
355 // Only for memory mapped writes
356 assert(options.use_mmap_writes);
357
358 // View size must be both the multiple of allocation_granularity AND the
359 // page size and the granularity is usually a multiple of a page size.
360 const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
361 view_size_ = Roundup(viewSize, allocation_granularity_);
362 }
363
364 WinMmapFile::~WinMmapFile() {
365 if (hFile_) {
366 this->Close();
367 }
368 }
369
370 Status WinMmapFile::Append(const Slice& data) {
371 const char* src = data.data();
372 size_t left = data.size();
373
374 while (left > 0) {
375 assert(mapped_begin_ <= dst_);
376 size_t avail = mapped_end_ - dst_;
377
378 if (avail == 0) {
379 Status s = UnmapCurrentRegion();
380 if (s.ok()) {
381 s = MapNewRegion();
382 }
383
384 if (!s.ok()) {
385 return s;
386 }
387 } else {
388 size_t n = std::min(left, avail);
389 memcpy(dst_, src, n);
390 dst_ += n;
391 src += n;
392 left -= n;
393 pending_sync_ = true;
394 }
395 }
396
397 // Now make sure that the last partial page is padded with zeros if needed
398 size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
399 if (bytesToPad > 0) {
400 memset(dst_, 0, bytesToPad);
401 }
402
403 return Status::OK();
404 }
405
406 // Means Close() will properly take care of truncate
407 // and it does not need any additional information
408 Status WinMmapFile::Truncate(uint64_t size) {
409 return Status::OK();
410 }
411
412 Status WinMmapFile::Close() {
413 Status s;
414
415 assert(NULL != hFile_);
416
417 // We truncate to the precise size so no
418 // uninitialized data at the end. SetEndOfFile
419 // which we use does not write zeros and it is good.
420 uint64_t targetSize = GetFileSize();
421
422 if (mapped_begin_ != nullptr) {
423 // Sync before unmapping to make sure everything
424 // is on disk and there is not a lazy writing
425 // so we are deterministic with the tests
426 Sync();
427 s = UnmapCurrentRegion();
428 }
429
430 if (NULL != hMap_) {
431 BOOL ret = ::CloseHandle(hMap_);
432 if (!ret && s.ok()) {
433 auto lastError = GetLastError();
434 s = IOErrorFromWindowsError(
435 "Failed to Close mapping for file: " + filename_, lastError);
436 }
437
438 hMap_ = NULL;
439 }
440
441 if (hFile_ != NULL) {
442
443 TruncateFile(targetSize);
444
445 BOOL ret = ::CloseHandle(hFile_);
446 hFile_ = NULL;
447
448 if (!ret && s.ok()) {
449 auto lastError = GetLastError();
450 s = IOErrorFromWindowsError(
451 "Failed to close file map handle: " + filename_, lastError);
452 }
453 }
454
455 return s;
456 }
457
458 Status WinMmapFile::Flush() { return Status::OK(); }
459
460 // Flush only data
461 Status WinMmapFile::Sync() {
462 Status s;
463
464 // Some writes occurred since last sync
465 if (dst_ > last_sync_) {
466 assert(mapped_begin_);
467 assert(dst_);
468 assert(dst_ > mapped_begin_);
469 assert(dst_ < mapped_end_);
470
471 size_t page_begin =
472 TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
473 size_t page_end =
474 TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
475
476 // Flush only the amount of that is a multiple of pages
477 if (!::FlushViewOfFile(mapped_begin_ + page_begin,
478 (page_end - page_begin) + page_size_)) {
479 s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
480 GetLastError());
481 } else {
482 last_sync_ = dst_;
483 }
484 }
485
486 return s;
487 }
488
489 /**
490 * Flush data as well as metadata to stable storage.
491 */
492 Status WinMmapFile::Fsync() {
493 Status s = Sync();
494
495 // Flush metadata
496 if (s.ok() && pending_sync_) {
497 if (!::FlushFileBuffers(hFile_)) {
498 s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
499 GetLastError());
500 }
501 pending_sync_ = false;
502 }
503
504 return s;
505 }
506
507 /**
508 * Get the size of valid data in the file. This will not match the
509 * size that is returned from the filesystem because we use mmap
510 * to extend file by map_size every time.
511 */
512 uint64_t WinMmapFile::GetFileSize() {
513 size_t used = dst_ - mapped_begin_;
514 return file_offset_ + used;
515 }
516
517 Status WinMmapFile::InvalidateCache(size_t offset, size_t length) {
518 return Status::OK();
519 }
520
521 Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) {
522 Status status;
523 TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);
524
525 // Make sure that we reserve an aligned amount of space
526 // since the reservation block size is driven outside so we want
527 // to check if we are ok with reservation here
528 size_t spaceToReserve = Roundup(offset + len, view_size_);
529 // Nothing to do
530 if (spaceToReserve <= reserved_size_) {
531 return status;
532 }
533
534 IOSTATS_TIMER_GUARD(allocate_nanos);
535 status = PreallocateInternal(spaceToReserve);
536 if (status.ok()) {
537 reserved_size_ = spaceToReserve;
538 }
539 return status;
540 }
541
542 size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
543 return GetUniqueIdFromFile(hFile_, id, max_size);
544 }
545
546 //////////////////////////////////////////////////////////////////////////////////
547 // WinSequentialFile
548
549 WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
550 const EnvOptions& options)
551 : WinFileData(fname, f, options.use_direct_reads) {}
552
553 WinSequentialFile::~WinSequentialFile() {
554 assert(hFile_ != INVALID_HANDLE_VALUE);
555 }
556
557 Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) {
558 assert(result != nullptr && !WinFileData::use_direct_io());
559 Status s;
560 size_t r = 0;
561
562 // Windows ReadFile API accepts a DWORD.
563 // While it is possible to read in a loop if n is > UINT_MAX
564 // it is a highly unlikely case.
565 if (n > UINT_MAX) {
566 return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER);
567 }
568
569 DWORD bytesToRead = static_cast<DWORD>(n); //cast is safe due to the check above
570 DWORD bytesRead = 0;
571 BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
572 if (ret == TRUE) {
573 r = bytesRead;
574 } else {
575 return IOErrorFromWindowsError(filename_, GetLastError());
576 }
577
578 *result = Slice(scratch, r);
579
580 return s;
581 }
582
583 SSIZE_T WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
584 uint64_t offset) const {
585 return pread(GetFileHandle(), src, numBytes, offset);
586 }
587
588 Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result,
589 char* scratch) {
590
591 Status s;
592
593 assert(WinFileData::use_direct_io());
594
595 // Windows ReadFile API accepts a DWORD.
596 // While it is possible to read in a loop if n is > UINT_MAX
597 // it is a highly unlikely case.
598 if (n > UINT_MAX) {
599 return IOErrorFromWindowsError(GetName(), ERROR_INVALID_PARAMETER);
600 }
601
602 auto r = PositionedReadInternal(scratch, n, offset);
603
604 if (r < 0) {
605 auto lastError = GetLastError();
606 // Posix impl wants to treat reads from beyond
607 // of the file as OK.
608 if (lastError != ERROR_HANDLE_EOF) {
609 s = IOErrorFromWindowsError(GetName(), lastError);
610 }
611 }
612
613 *result = Slice(scratch, (r < 0) ? 0 : size_t(r));
614 return s;
615 }
616
617
618 Status WinSequentialFile::Skip(uint64_t n) {
619 // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit
620 // integer. As such it is a highly unlikley case to have n so large.
621 if (n > _I64_MAX) {
622 return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER);
623 }
624
625 LARGE_INTEGER li;
626 li.QuadPart = static_cast<int64_t>(n); //cast is safe due to the check above
627 BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
628 if (ret == FALSE) {
629 return IOErrorFromWindowsError(filename_, GetLastError());
630 }
631 return Status::OK();
632 }
633
634 Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
635 return Status::OK();
636 }
637
638 //////////////////////////////////////////////////////////////////////////////////////////////////
639 /// WinRandomAccessBase
640
641 // Helper
642 void CalculateReadParameters(size_t alignment, uint64_t offset,
643 size_t bytes_requested,
644 size_t& actual_bytes_toread,
645 uint64_t& first_page_start) {
646
647 first_page_start = TruncateToPageBoundary(alignment, offset);
648 const uint64_t last_page_start =
649 TruncateToPageBoundary(alignment, offset + bytes_requested - 1);
650 actual_bytes_toread = (last_page_start - first_page_start) + alignment;
651 }
652
653 SSIZE_T WinRandomAccessImpl::ReadIntoBuffer(uint64_t user_offset,
654 uint64_t first_page_start,
655 size_t bytes_to_read, size_t& left,
656 AlignedBuffer& buffer, char* dest) const {
657 assert(buffer.CurrentSize() == 0);
658 assert(buffer.Capacity() >= bytes_to_read);
659
660 SSIZE_T read =
661 PositionedReadInternal(buffer.Destination(), bytes_to_read,
662 first_page_start);
663
664 if (read > 0) {
665 buffer.Size(read);
666
667 // Let's figure out how much we read from the users standpoint
668 if ((first_page_start + buffer.CurrentSize()) > user_offset) {
669 assert(first_page_start <= user_offset);
670 size_t buffer_offset = user_offset - first_page_start;
671 read = buffer.Read(dest, buffer_offset, left);
672 } else {
673 read = 0;
674 }
675 left -= read;
676 }
677 return read;
678 }
679
680 SSIZE_T WinRandomAccessImpl::ReadIntoOneShotBuffer(uint64_t user_offset,
681 uint64_t first_page_start,
682 size_t bytes_to_read, size_t& left,
683 char* dest) const {
684 AlignedBuffer bigBuffer;
685 bigBuffer.Alignment(buffer_.Alignment());
686 bigBuffer.AllocateNewBuffer(bytes_to_read);
687
688 return ReadIntoBuffer(user_offset, first_page_start, bytes_to_read, left,
689 bigBuffer, dest);
690 }
691
692 SSIZE_T WinRandomAccessImpl::ReadIntoInstanceBuffer(uint64_t user_offset,
693 uint64_t first_page_start,
694 size_t bytes_to_read, size_t& left,
695 char* dest) const {
696 SSIZE_T read = ReadIntoBuffer(user_offset, first_page_start, bytes_to_read,
697 left, buffer_, dest);
698
699 if (read > 0) {
700 buffered_start_ = first_page_start;
701 }
702
703 return read;
704 }
705
706 SSIZE_T WinRandomAccessImpl::PositionedReadInternal(char* src,
707 size_t numBytes,
708 uint64_t offset) const {
709 return pread(file_base_->GetFileHandle(), src, numBytes, offset);
710 }
711
712 inline
713 WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
714 size_t alignment,
715 const EnvOptions& options) :
716 file_base_(file_base),
717 read_ahead_(false),
718 compaction_readahead_size_(options.compaction_readahead_size),
719 random_access_max_buffer_size_(options.random_access_max_buffer_size),
720 buffer_(),
721 buffered_start_(0) {
722
723 assert(!options.use_mmap_reads);
724
725 // Do not allocate the buffer either until the first request or
726 // until there is a call to allocate a read-ahead buffer
727 buffer_.Alignment(alignment);
728 }
729
730 inline
731 Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
732 char* scratch) const {
733
734 Status s;
735 SSIZE_T r = -1;
736 size_t left = n;
737 char* dest = scratch;
738
739 if (n == 0) {
740 *result = Slice(scratch, 0);
741 return s;
742 }
743
744 // When in direct I/O mode we need to do the following changes:
745 // - use our own aligned buffer
746 // - always read at the offset of that is a multiple of alignment
747 if (file_base_->use_direct_io()) {
748 uint64_t first_page_start = 0;
749 size_t actual_bytes_toread = 0;
750 size_t bytes_requested = left;
751
752 if (!read_ahead_ && random_access_max_buffer_size_ == 0) {
753 CalculateReadParameters(buffer_.Alignment(), offset, bytes_requested,
754 actual_bytes_toread,
755 first_page_start);
756
757 assert(actual_bytes_toread > 0);
758
759 r = ReadIntoOneShotBuffer(offset, first_page_start,
760 actual_bytes_toread, left, dest);
761 } else {
762
763 std::unique_lock<std::mutex> lock(buffer_mut_);
764
765 // Let's see if at least some of the requested data is already
766 // in the buffer
767 if (offset >= buffered_start_ &&
768 offset < (buffered_start_ + buffer_.CurrentSize())) {
769 size_t buffer_offset = offset - buffered_start_;
770 r = buffer_.Read(dest, buffer_offset, left);
771 assert(r >= 0);
772
773 left -= size_t(r);
774 offset += r;
775 dest += r;
776 }
777
778 // Still some left or none was buffered
779 if (left > 0) {
780 // Figure out the start/end offset for reading and amount to read
781 bytes_requested = left;
782
783 if (read_ahead_ && bytes_requested < compaction_readahead_size_) {
784 bytes_requested = compaction_readahead_size_;
785 }
786
787 CalculateReadParameters(buffer_.Alignment(), offset, bytes_requested,
788 actual_bytes_toread,
789 first_page_start);
790
791 assert(actual_bytes_toread > 0);
792
793 if (buffer_.Capacity() < actual_bytes_toread) {
794 // If we are in read-ahead mode or the requested size
795 // exceeds max buffer size then use one-shot
796 // big buffer otherwise reallocate main buffer
797 if (read_ahead_ ||
798 (actual_bytes_toread > random_access_max_buffer_size_)) {
799 // Unlock the mutex since we are not using instance buffer
800 lock.unlock();
801 r = ReadIntoOneShotBuffer(offset, first_page_start,
802 actual_bytes_toread, left, dest);
803 } else {
804 buffer_.AllocateNewBuffer(actual_bytes_toread);
805 r = ReadIntoInstanceBuffer(offset, first_page_start,
806 actual_bytes_toread, left, dest);
807 }
808 } else {
809 buffer_.Clear();
810 r = ReadIntoInstanceBuffer(offset, first_page_start,
811 actual_bytes_toread, left, dest);
812 }
813 }
814 }
815 } else {
816 r = PositionedReadInternal(scratch, left, offset);
817 if (r > 0) {
818 left -= r;
819 }
820 }
821
822 if (r < 0) {
823 auto lastError = GetLastError();
824 // Posix impl wants to treat reads from beyond
825 // of the file as OK.
826 if(lastError != ERROR_HANDLE_EOF) {
827 s = IOErrorFromWindowsError(file_base_->GetName(), lastError);
828 }
829 }
830
831 *result = Slice(scratch, (r < 0) ? 0 : n - left);
832
833 return s;
834 }
835
836 inline
837 void WinRandomAccessImpl::HintImpl(RandomAccessFile::AccessPattern pattern) {
838 if (pattern == RandomAccessFile::SEQUENTIAL && file_base_->use_direct_io() &&
839 compaction_readahead_size_ > 0) {
840 std::lock_guard<std::mutex> lg(buffer_mut_);
841 if (!read_ahead_) {
842 read_ahead_ = true;
843 // This would allocate read-ahead size + 2 alignments
844 // - one for memory alignment which added implicitly by AlignedBuffer
845 // - We add one more alignment because we will read one alignment more
846 // from disk
847 buffer_.AllocateNewBuffer(compaction_readahead_size_ +
848 buffer_.Alignment());
849 }
850 }
851 }
852
853 ///////////////////////////////////////////////////////////////////////////////////////////////////
854 /// WinRandomAccessFile
855
856 WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
857 size_t alignment,
858 const EnvOptions& options)
859 : WinFileData(fname, hFile, options.use_direct_reads),
860 WinRandomAccessImpl(this, alignment, options) {}
861
862 WinRandomAccessFile::~WinRandomAccessFile() {
863 }
864
865 Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
866 char* scratch) const {
867 return ReadImpl(offset, n, result, scratch);
868 }
869
870 void WinRandomAccessFile::EnableReadAhead() {
871 HintImpl(SEQUENTIAL);
872 }
873
874 bool WinRandomAccessFile::ShouldForwardRawRequest() const {
875 return true;
876 }
877
878 void WinRandomAccessFile::Hint(AccessPattern pattern) {
879 HintImpl(pattern);
880 }
881
882 Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
883 return Status::OK();
884 }
885
886 size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
887 return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
888 }
889
890 size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
891 return GetAlignment();
892 }
893
894 /////////////////////////////////////////////////////////////////////////////
895 // WinWritableImpl
896 //
897
898 inline
899 Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
900 return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve);
901 }
902
903 WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment)
904 : file_data_(file_data),
905 alignment_(alignment),
906 filesize_(0),
907 reservedsize_(0) {
908 }
909
910 Status WinWritableImpl::AppendImpl(const Slice& data) {
911
912 Status s;
913
914 assert(data.size() < std::numeric_limits<DWORD>::max());
915
916 uint64_t written = 0;
917
918 if (file_data_->use_direct_io()) {
919
920 // With no offset specified we are appending
921 // to the end of the file
922
923 assert(IsSectorAligned(filesize_));
924 assert(IsSectorAligned(data.size()));
925 assert(IsAligned(GetAlignement(), data.data()));
926
927 SSIZE_T ret = pwrite(file_data_->GetFileHandle(), data.data(),
928 data.size(), filesize_);
929
930 if (ret < 0) {
931 auto lastError = GetLastError();
932 s = IOErrorFromWindowsError(
933 "Failed to pwrite for: " + file_data_->GetName(), lastError);
934 }
935 else {
936 written = ret;
937 }
938
939 } else {
940
941 DWORD bytesWritten = 0;
942 if (!WriteFile(file_data_->GetFileHandle(), data.data(),
943 static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
944 auto lastError = GetLastError();
945 s = IOErrorFromWindowsError(
946 "Failed to WriteFile: " + file_data_->GetName(),
947 lastError);
948 }
949 else {
950 written = bytesWritten;
951 }
952 }
953
954 if(s.ok()) {
955 assert(written == data.size());
956 filesize_ += data.size();
957 }
958
959 return s;
960 }
961
962 Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) {
963
964 if(file_data_->use_direct_io()) {
965 assert(IsSectorAligned(offset));
966 assert(IsSectorAligned(data.size()));
967 assert(IsAligned(GetAlignement(), data.data()));
968 }
969
970 Status s;
971
972 SSIZE_T ret = pwrite(file_data_->GetFileHandle(), data.data(), data.size(), offset);
973
974 // Error break
975 if (ret < 0) {
976 auto lastError = GetLastError();
977 s = IOErrorFromWindowsError(
978 "Failed to pwrite for: " + file_data_->GetName(), lastError);
979 }
980 else {
981 assert(size_t(ret) == data.size());
982 // For sequential write this would be simple
983 // size extension by data.size()
984 uint64_t write_end = offset + data.size();
985 if (write_end >= filesize_) {
986 filesize_ = write_end;
987 }
988 }
989 return s;
990 }
991
992 // Need to implement this so the file is truncated correctly
993 // when buffered and unbuffered mode
994 inline
995 Status WinWritableImpl::TruncateImpl(uint64_t size) {
996 Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(),
997 size);
998 if (s.ok()) {
999 filesize_ = size;
1000 }
1001 return s;
1002 }
1003
1004 Status WinWritableImpl::CloseImpl() {
1005
1006 Status s;
1007
1008 auto hFile = file_data_->GetFileHandle();
1009 assert(INVALID_HANDLE_VALUE != hFile);
1010
1011 if (fsync(hFile) < 0) {
1012 auto lastError = GetLastError();
1013 s = IOErrorFromWindowsError("fsync failed at Close() for: " +
1014 file_data_->GetName(),
1015 lastError);
1016 }
1017
1018 if(!file_data_->CloseFile()) {
1019 auto lastError = GetLastError();
1020 s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(),
1021 lastError);
1022 }
1023 return s;
1024 }
1025
1026 Status WinWritableImpl::SyncImpl() {
1027 Status s;
1028 // Calls flush buffers
1029 if (fsync(file_data_->GetFileHandle()) < 0) {
1030 auto lastError = GetLastError();
1031 s = IOErrorFromWindowsError(
1032 "fsync failed at Sync() for: " + file_data_->GetName(), lastError);
1033 }
1034 return s;
1035 }
1036
1037
1038 Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
1039 Status status;
1040 TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds);
1041
1042 // Make sure that we reserve an aligned amount of space
1043 // since the reservation block size is driven outside so we want
1044 // to check if we are ok with reservation here
1045 size_t spaceToReserve = Roundup(offset + len, alignment_);
1046 // Nothing to do
1047 if (spaceToReserve <= reservedsize_) {
1048 return status;
1049 }
1050
1051 IOSTATS_TIMER_GUARD(allocate_nanos);
1052 status = PreallocateInternal(spaceToReserve);
1053 if (status.ok()) {
1054 reservedsize_ = spaceToReserve;
1055 }
1056 return status;
1057 }
1058
1059
1060 ////////////////////////////////////////////////////////////////////////////////
1061 /// WinWritableFile
1062
1063 WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
1064 size_t alignment, size_t /* capacity */,
1065 const EnvOptions& options)
1066 : WinFileData(fname, hFile, options.use_direct_writes),
1067 WinWritableImpl(this, alignment) {
1068 assert(!options.use_mmap_writes);
1069 }
1070
1071 WinWritableFile::~WinWritableFile() {
1072 }
1073
1074 // Indicates if the class makes use of direct I/O
1075 bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); }
1076
1077 size_t WinWritableFile::GetRequiredBufferAlignment() const {
1078 return GetAlignement();
1079 }
1080
1081 Status WinWritableFile::Append(const Slice& data) {
1082 return AppendImpl(data);
1083 }
1084
1085 Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
1086 return PositionedAppendImpl(data, offset);
1087 }
1088
1089 // Need to implement this so the file is truncated correctly
1090 // when buffered and unbuffered mode
1091 Status WinWritableFile::Truncate(uint64_t size) {
1092 return TruncateImpl(size);
1093 }
1094
1095 Status WinWritableFile::Close() {
1096 return CloseImpl();
1097 }
1098
1099 // write out the cached data to the OS cache
1100 // This is now taken care of the WritableFileWriter
1101 Status WinWritableFile::Flush() {
1102 return Status::OK();
1103 }
1104
1105 Status WinWritableFile::Sync() {
1106 return SyncImpl();
1107 }
1108
1109 Status WinWritableFile::Fsync() { return SyncImpl(); }
1110
1111 uint64_t WinWritableFile::GetFileSize() {
1112 return GetFileSizeImpl();
1113 }
1114
1115 Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) {
1116 return AllocateImpl(offset, len);
1117 }
1118
1119 size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
1120 return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
1121 }
1122
1123 /////////////////////////////////////////////////////////////////////////
1124 /// WinRandomRWFile
1125
1126 WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
1127 size_t alignment, const EnvOptions& options)
1128 : WinFileData(fname, hFile,
1129 options.use_direct_reads && options.use_direct_writes),
1130 WinRandomAccessImpl(this, alignment, options),
1131 WinWritableImpl(this, alignment) {}
1132
1133 bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); }
1134
1135 size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
1136 return GetAlignement();
1137 }
1138
1139 bool WinRandomRWFile::ShouldForwardRawRequest() const {
1140 return true;
1141 }
1142
1143 void WinRandomRWFile::EnableReadAhead() {
1144 HintImpl(RandomAccessFile::SEQUENTIAL);
1145 }
1146
1147 Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) {
1148 return PositionedAppendImpl(data, offset);
1149 }
1150
1151 Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
1152 char* scratch) const {
1153 return ReadImpl(offset, n, result, scratch);
1154 }
1155
1156 Status WinRandomRWFile::Flush() {
1157 return Status::OK();
1158 }
1159
1160 Status WinRandomRWFile::Sync() {
1161 return SyncImpl();
1162 }
1163
1164 Status WinRandomRWFile::Close() {
1165 return CloseImpl();
1166 }
1167
1168 //////////////////////////////////////////////////////////////////////////
1169 /// WinDirectory
1170
1171 Status WinDirectory::Fsync() { return Status::OK(); }
1172
1173 //////////////////////////////////////////////////////////////////////////
1174 /// WinFileLock
1175
1176 WinFileLock::~WinFileLock() {
1177 BOOL ret = ::CloseHandle(hFile_);
1178 assert(ret);
1179 }
1180
1181 }
1182 }