]> git.proxmox.com Git - ceph.git/blame - ceph/src/rocksdb/port/win/io_win.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / rocksdb / port / win / io_win.cc
CommitLineData
7c673cae 1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
11fdf7f2
TL
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
7c673cae
FG
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
20effc67
TL
10#if defined(OS_WIN)
11
7c673cae
FG
12#include "port/win/io_win.h"
13
1e59de90 14#include "env_win.h"
7c673cae 15#include "monitoring/iostats_context_imp.h"
f67539c2 16#include "test_util/sync_point.h"
7c673cae
FG
17#include "util/aligned_buffer.h"
18#include "util/coding.h"
7c673cae 19
f67539c2 20namespace ROCKSDB_NAMESPACE {
7c673cae
FG
21namespace port {
22
23/*
1e59de90
TL
24 * DirectIOHelper
25 */
7c673cae
FG
namespace {

// 512 bytes. NOTE(review): presumably a conventional disk sector size; it is
// not referenced in the part of the file visible here - confirm its users.
const size_t kSectorSize = 512;

// Returns true when `alignment` has exactly one bit set.
// Zero is explicitly rejected: the bare `x & (x - 1)` trick reports zero as a
// power of two, which is never a usable alignment.
inline bool IsPowerOfTwo(const size_t alignment) {
  return alignment != 0 && (alignment & (alignment - 1)) == 0;
}

// Returns true when the address held by `ptr` is a multiple of `alignment`.
// The mask arithmetic is only meaningful when `alignment` is a power of two.
inline bool IsAligned(size_t alignment, const void* ptr) {
  return ((uintptr_t(ptr)) & (alignment - 1)) == 0;
}
}  // namespace
7c673cae
FG
38
39std::string GetWindowsErrSz(DWORD err) {
1e59de90
TL
40 std::string Err;
41 LPSTR lpMsgBuf = nullptr;
7c673cae 42 FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
1e59de90
TL
43 FORMAT_MESSAGE_IGNORE_INSERTS,
44 NULL, err,
45 0, // Default language
46 reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
47
48 if (lpMsgBuf) {
49 Err = lpMsgBuf;
50 LocalFree(lpMsgBuf);
51 }
7c673cae
FG
52 return Err;
53}
54
55// We preserve the original name of this interface to denote the original idea
56// behind it.
57// All reads happen by a specified offset and pwrite interface does not change
58// the position of the file pointer. Judging from the man page and errno it does
59// execute
60// lseek atomically to return the position of the file back where it was.
61// WriteFile() does not
62// have this capability. Therefore, for both pread and pwrite the pointer is
63// advanced to the next position
64// which is fine for writes because they are (should be) sequential.
65// Because all the reads/writes happen by the specified offset, the caller in
66// theory should not
67// rely on the current file offset.
1e59de90
TL
68IOStatus pwrite(const WinFileData* file_data, const Slice& data,
69 uint64_t offset, size_t& bytes_written) {
70 IOStatus s;
11fdf7f2
TL
71 bytes_written = 0;
72
73 size_t num_bytes = data.size();
74 if (num_bytes > std::numeric_limits<DWORD>::max()) {
75 // May happen in 64-bit builds where size_t is 64-bits but
76 // long is still 32-bit, but that's the API here at the moment
1e59de90
TL
77 return IOStatus::InvalidArgument(
78 "num_bytes is too large for a single write: " + file_data->GetName());
11fdf7f2
TL
79 }
80
1e59de90 81 OVERLAPPED overlapped = {0};
7c673cae
FG
82 ULARGE_INTEGER offsetUnion;
83 offsetUnion.QuadPart = offset;
84
85 overlapped.Offset = offsetUnion.LowPart;
86 overlapped.OffsetHigh = offsetUnion.HighPart;
87
11fdf7f2 88 DWORD bytesWritten = 0;
7c673cae 89
1e59de90
TL
90 if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(),
91 static_cast<DWORD>(num_bytes), &bytesWritten,
92 &overlapped)) {
11fdf7f2
TL
93 auto lastError = GetLastError();
94 s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
1e59de90 95 lastError);
7c673cae 96 } else {
11fdf7f2 97 bytes_written = bytesWritten;
7c673cae
FG
98 }
99
11fdf7f2 100 return s;
7c673cae
FG
101}
102
103// See comments for pwrite above
1e59de90
TL
104IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
105 uint64_t offset, size_t& bytes_read) {
106 IOStatus s;
11fdf7f2
TL
107 bytes_read = 0;
108
109 if (num_bytes > std::numeric_limits<DWORD>::max()) {
1e59de90
TL
110 return IOStatus::InvalidArgument(
111 "num_bytes is too large for a single read: " + file_data->GetName());
11fdf7f2
TL
112 }
113
1e59de90 114 OVERLAPPED overlapped = {0};
7c673cae
FG
115 ULARGE_INTEGER offsetUnion;
116 offsetUnion.QuadPart = offset;
117
118 overlapped.Offset = offsetUnion.LowPart;
119 overlapped.OffsetHigh = offsetUnion.HighPart;
120
11fdf7f2 121 DWORD bytesRead = 0;
7c673cae 122
1e59de90
TL
123 if (FALSE == ReadFile(file_data->GetFileHandle(), src,
124 static_cast<DWORD>(num_bytes), &bytesRead,
125 &overlapped)) {
11fdf7f2
TL
126 auto lastError = GetLastError();
127 // EOF is OK with zero bytes read
128 if (lastError != ERROR_HANDLE_EOF) {
129 s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
1e59de90 130 lastError);
11fdf7f2 131 }
7c673cae 132 } else {
11fdf7f2 133 bytes_read = bytesRead;
7c673cae
FG
134 }
135
11fdf7f2 136 return s;
7c673cae
FG
137}
138
139// SetFileInformationByHandle() is capable of fast pre-allocates.
140// However, this does not change the file end position unless the file is
141// truncated and the pre-allocated space is not considered filled with zeros.
1e59de90
TL
142IOStatus fallocate(const std::string& filename, HANDLE hFile,
143 uint64_t to_size) {
144 IOStatus status;
7c673cae
FG
145
146 FILE_ALLOCATION_INFO alloc_info;
147 alloc_info.AllocationSize.QuadPart = to_size;
148
149 if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
1e59de90 150 sizeof(FILE_ALLOCATION_INFO))) {
7c673cae
FG
151 auto lastError = GetLastError();
152 status = IOErrorFromWindowsError(
1e59de90 153 "Failed to pre-allocate space: " + filename, lastError);
7c673cae
FG
154 }
155
156 return status;
157}
158
1e59de90
TL
159IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize) {
160 IOStatus status;
7c673cae
FG
161
162 FILE_END_OF_FILE_INFO end_of_file;
163 end_of_file.EndOfFile.QuadPart = toSize;
164
165 if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
1e59de90 166 sizeof(FILE_END_OF_FILE_INFO))) {
7c673cae
FG
167 auto lastError = GetLastError();
168 status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
1e59de90 169 lastError);
7c673cae
FG
170 }
171
172 return status;
173}
174
f67539c2
TL
175size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
176 size_t /*max_size*/) {
177 // Returning 0 is safe as it causes the table reader to generate a unique ID.
178 // This is suboptimal for performance as it prevents multiple table readers
179 // for the same file from sharing cached blocks. For example, if users have
180 // a low value for `max_open_files`, there can be many table readers opened
181 // for the same file.
182 //
183 // TODO: this is a temporarily solution as it is safe but not optimal for
184 // performance. For more details see discussion in
185 // https://github.com/facebook/rocksdb/pull/5844.
186 return 0;
7c673cae
FG
187}
188
1e59de90
TL
189WinFileData::WinFileData(const std::string& filename, HANDLE hFile,
190 bool direct_io)
191 : filename_(filename),
192 hFile_(hFile),
193 use_direct_io_(direct_io),
194 sector_size_(WinFileSystem::GetSectorSize(filename)) {}
195
196bool WinFileData::IsSectorAligned(const size_t off) const {
197 return (off & (sector_size_ - 1)) == 0;
198}
199
7c673cae
FG
200////////////////////////////////////////////////////////////////////////////////////////////////////
201// WinMmapReadableFile
202
203WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
204 HANDLE hFile, HANDLE hMap,
205 const void* mapped_region,
206 size_t length)
207 : WinFileData(fileName, hFile, false /* use_direct_io */),
208 hMap_(hMap),
209 mapped_region_(mapped_region),
210 length_(length) {}
211
212WinMmapReadableFile::~WinMmapReadableFile() {
11fdf7f2
TL
213 BOOL ret __attribute__((__unused__));
214 ret = ::UnmapViewOfFile(mapped_region_);
7c673cae
FG
215 assert(ret);
216
217 ret = ::CloseHandle(hMap_);
218 assert(ret);
219}
220
1e59de90
TL
221IOStatus WinMmapReadableFile::Read(uint64_t offset, size_t n,
222 const IOOptions& /*options*/, Slice* result,
223 char* scratch,
224 IODebugContext* /*dbg*/) const {
225 IOStatus s;
7c673cae
FG
226
227 if (offset > length_) {
228 *result = Slice();
229 return IOError(filename_, EINVAL);
230 } else if (offset + n > length_) {
11fdf7f2 231 n = length_ - static_cast<size_t>(offset);
7c673cae 232 }
1e59de90 233 *result = Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
7c673cae
FG
234 return s;
235}
236
1e59de90
TL
237IOStatus WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
238 return IOStatus::OK();
7c673cae
FG
239}
240
241size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
242 return GetUniqueIdFromFile(hFile_, id, max_size);
243}
244
245///////////////////////////////////////////////////////////////////////////////
246/// WinMmapFile
247
7c673cae
FG
248// Can only truncate or reserve to a sector size aligned if
249// used on files that are opened with Unbuffered I/O
1e59de90 250IOStatus WinMmapFile::TruncateFile(uint64_t toSize) {
7c673cae
FG
251 return ftruncate(filename_, hFile_, toSize);
252}
253
1e59de90
TL
254IOStatus WinMmapFile::UnmapCurrentRegion() {
255 IOStatus status;
7c673cae
FG
256
257 if (mapped_begin_ != nullptr) {
258 if (!::UnmapViewOfFile(mapped_begin_)) {
259 status = IOErrorFromWindowsError(
1e59de90 260 "Failed to unmap file view: " + filename_, GetLastError());
7c673cae
FG
261 }
262
263 // Move on to the next portion of the file
264 file_offset_ += view_size_;
265
266 // UnmapView automatically sends data to disk but not the metadata
267 // which is good and provides some equivalent of fdatasync() on Linux
268 // therefore, we donot need separate flag for metadata
269 mapped_begin_ = nullptr;
270 mapped_end_ = nullptr;
271 dst_ = nullptr;
272
273 last_sync_ = nullptr;
274 pending_sync_ = false;
275 }
276
277 return status;
278}
279
1e59de90
TL
280IOStatus WinMmapFile::MapNewRegion(const IOOptions& options,
281 IODebugContext* dbg) {
282 IOStatus status;
7c673cae
FG
283
284 assert(mapped_begin_ == nullptr);
285
11fdf7f2 286 size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
7c673cae
FG
287
288 if (minDiskSize > reserved_size_) {
1e59de90 289 status = Allocate(file_offset_, view_size_, options, dbg);
7c673cae
FG
290 if (!status.ok()) {
291 return status;
292 }
293 }
294
295 // Need to remap
296 if (hMap_ == NULL || reserved_size_ > mapping_size_) {
7c673cae
FG
297 if (hMap_ != NULL) {
298 // Unmap the previous one
11fdf7f2
TL
299 BOOL ret __attribute__((__unused__));
300 ret = ::CloseHandle(hMap_);
7c673cae
FG
301 assert(ret);
302 hMap_ = NULL;
303 }
304
305 ULARGE_INTEGER mappingSize;
306 mappingSize.QuadPart = reserved_size_;
307
308 hMap_ = CreateFileMappingA(
1e59de90
TL
309 hFile_,
310 NULL, // Security attributes
311 PAGE_READWRITE, // There is not a write only mode for mapping
312 mappingSize.HighPart, // Enable mapping the whole file but the actual
313 // amount mapped is determined by MapViewOfFile
314 mappingSize.LowPart,
315 NULL); // Mapping name
7c673cae
FG
316
317 if (NULL == hMap_) {
318 return IOErrorFromWindowsError(
1e59de90
TL
319 "WindowsMmapFile failed to create file mapping for: " + filename_,
320 GetLastError());
7c673cae
FG
321 }
322
323 mapping_size_ = reserved_size_;
324 }
325
326 ULARGE_INTEGER offset;
327 offset.QuadPart = file_offset_;
328
329 // View must begin at the granularity aligned offset
330 mapped_begin_ = reinterpret_cast<char*>(
1e59de90
TL
331 MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
332 view_size_, NULL));
7c673cae
FG
333
334 if (!mapped_begin_) {
335 status = IOErrorFromWindowsError(
1e59de90
TL
336 "WindowsMmapFile failed to map file view: " + filename_,
337 GetLastError());
7c673cae
FG
338 } else {
339 mapped_end_ = mapped_begin_ + view_size_;
340 dst_ = mapped_begin_;
341 last_sync_ = mapped_begin_;
342 pending_sync_ = false;
343 }
344 return status;
345}
346
1e59de90 347IOStatus WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
7c673cae
FG
348 return fallocate(filename_, hFile_, spaceToReserve);
349}
350
f67539c2
TL
351WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
352 size_t page_size, size_t allocation_granularity,
1e59de90 353 const FileOptions& options)
f67539c2 354 : WinFileData(fname, hFile, false),
1e59de90 355 FSWritableFile(options),
f67539c2
TL
356 hMap_(NULL),
357 page_size_(page_size),
358 allocation_granularity_(allocation_granularity),
359 reserved_size_(0),
360 mapping_size_(0),
361 view_size_(0),
362 mapped_begin_(nullptr),
363 mapped_end_(nullptr),
364 dst_(nullptr),
365 last_sync_(nullptr),
366 file_offset_(0),
367 pending_sync_(false) {
7c673cae
FG
368 // Allocation granularity must be obtained from GetSystemInfo() and must be
369 // a power of two.
370 assert(allocation_granularity > 0);
371 assert((allocation_granularity & (allocation_granularity - 1)) == 0);
372
373 assert(page_size > 0);
374 assert((page_size & (page_size - 1)) == 0);
375
376 // Only for memory mapped writes
377 assert(options.use_mmap_writes);
378
379 // View size must be both the multiple of allocation_granularity AND the
380 // page size and the granularity is usually a multiple of a page size.
1e59de90
TL
381 const size_t viewSize =
382 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
7c673cae
FG
383 view_size_ = Roundup(viewSize, allocation_granularity_);
384}
385
386WinMmapFile::~WinMmapFile() {
387 if (hFile_) {
1e59de90 388 this->Close(IOOptions(), nullptr);
7c673cae
FG
389 }
390}
391
1e59de90
TL
392IOStatus WinMmapFile::Append(const Slice& data, const IOOptions& options,
393 IODebugContext* dbg) {
7c673cae
FG
394 const char* src = data.data();
395 size_t left = data.size();
396
397 while (left > 0) {
398 assert(mapped_begin_ <= dst_);
399 size_t avail = mapped_end_ - dst_;
400
401 if (avail == 0) {
1e59de90 402 IOStatus s = UnmapCurrentRegion();
7c673cae 403 if (s.ok()) {
1e59de90 404 s = MapNewRegion(options, dbg);
7c673cae
FG
405 }
406
407 if (!s.ok()) {
408 return s;
409 }
410 } else {
411 size_t n = std::min(left, avail);
412 memcpy(dst_, src, n);
413 dst_ += n;
414 src += n;
415 left -= n;
416 pending_sync_ = true;
417 }
418 }
419
420 // Now make sure that the last partial page is padded with zeros if needed
421 size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
422 if (bytesToPad > 0) {
423 memset(dst_, 0, bytesToPad);
424 }
425
1e59de90 426 return IOStatus::OK();
7c673cae
FG
427}
428
429// Means Close() will properly take care of truncate
430// and it does not need any additional information
1e59de90
TL
431IOStatus WinMmapFile::Truncate(uint64_t size, const IOOptions& /*options*/,
432 IODebugContext* /*dbg*/) {
433 return IOStatus::OK();
7c673cae
FG
434}
435
1e59de90
TL
436IOStatus WinMmapFile::Close(const IOOptions& options, IODebugContext* dbg) {
437 IOStatus s;
7c673cae
FG
438
439 assert(NULL != hFile_);
440
441 // We truncate to the precise size so no
442 // uninitialized data at the end. SetEndOfFile
443 // which we use does not write zeros and it is good.
1e59de90 444 uint64_t targetSize = GetFileSize(options, dbg);
7c673cae
FG
445
446 if (mapped_begin_ != nullptr) {
447 // Sync before unmapping to make sure everything
448 // is on disk and there is not a lazy writing
449 // so we are deterministic with the tests
1e59de90 450 Sync(options, dbg);
7c673cae
FG
451 s = UnmapCurrentRegion();
452 }
453
454 if (NULL != hMap_) {
455 BOOL ret = ::CloseHandle(hMap_);
456 if (!ret && s.ok()) {
457 auto lastError = GetLastError();
458 s = IOErrorFromWindowsError(
1e59de90 459 "Failed to Close mapping for file: " + filename_, lastError);
7c673cae
FG
460 }
461
462 hMap_ = NULL;
463 }
464
465 if (hFile_ != NULL) {
7c673cae
FG
466 TruncateFile(targetSize);
467
468 BOOL ret = ::CloseHandle(hFile_);
469 hFile_ = NULL;
470
471 if (!ret && s.ok()) {
472 auto lastError = GetLastError();
473 s = IOErrorFromWindowsError(
1e59de90 474 "Failed to close file map handle: " + filename_, lastError);
7c673cae
FG
475 }
476 }
477
478 return s;
479}
480
1e59de90
TL
481IOStatus WinMmapFile::Flush(const IOOptions& /*options*/,
482 IODebugContext* /*dbg*/) {
483 return IOStatus::OK();
484}
7c673cae
FG
485
486// Flush only data
1e59de90
TL
487IOStatus WinMmapFile::Sync(const IOOptions& /*options*/,
488 IODebugContext* /*dbg*/) {
489 IOStatus s;
7c673cae
FG
490
491 // Some writes occurred since last sync
492 if (dst_ > last_sync_) {
493 assert(mapped_begin_);
494 assert(dst_);
495 assert(dst_ > mapped_begin_);
496 assert(dst_ < mapped_end_);
497
498 size_t page_begin =
1e59de90 499 TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
7c673cae 500 size_t page_end =
1e59de90 501 TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
7c673cae
FG
502
503 // Flush only the amount of that is a multiple of pages
504 if (!::FlushViewOfFile(mapped_begin_ + page_begin,
1e59de90 505 (page_end - page_begin) + page_size_)) {
7c673cae 506 s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
1e59de90 507 GetLastError());
7c673cae
FG
508 } else {
509 last_sync_ = dst_;
510 }
511 }
512
513 return s;
514}
515
516/**
1e59de90
TL
517 * Flush data as well as metadata to stable storage.
518 */
519IOStatus WinMmapFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
520 IOStatus s = Sync(options, dbg);
7c673cae
FG
521
522 // Flush metadata
523 if (s.ok() && pending_sync_) {
524 if (!::FlushFileBuffers(hFile_)) {
525 s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
1e59de90 526 GetLastError());
7c673cae
FG
527 }
528 pending_sync_ = false;
529 }
530
531 return s;
532}
533
534/**
1e59de90
TL
535 * Get the size of valid data in the file. This will not match the
536 * size that is returned from the filesystem because we use mmap
537 * to extend file by map_size every time.
538 */
539uint64_t WinMmapFile::GetFileSize(const IOOptions& /*options*/,
540 IODebugContext* /*dbg*/) {
7c673cae
FG
541 size_t used = dst_ - mapped_begin_;
542 return file_offset_ + used;
543}
544
1e59de90
TL
545IOStatus WinMmapFile::InvalidateCache(size_t offset, size_t length) {
546 return IOStatus::OK();
7c673cae
FG
547}
548
1e59de90
TL
549IOStatus WinMmapFile::Allocate(uint64_t offset, uint64_t len,
550 const IOOptions& /*options*/,
551 IODebugContext* /*dbg*/) {
552 IOStatus status;
553 TEST_KILL_RANDOM("WinMmapFile::Allocate");
7c673cae
FG
554
555 // Make sure that we reserve an aligned amount of space
556 // since the reservation block size is driven outside so we want
557 // to check if we are ok with reservation here
1e59de90
TL
558 size_t spaceToReserve =
559 Roundup(static_cast<size_t>(offset + len), view_size_);
7c673cae
FG
560 // Nothing to do
561 if (spaceToReserve <= reserved_size_) {
562 return status;
563 }
564
565 IOSTATS_TIMER_GUARD(allocate_nanos);
566 status = PreallocateInternal(spaceToReserve);
567 if (status.ok()) {
568 reserved_size_ = spaceToReserve;
569 }
570 return status;
571}
572
573size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
574 return GetUniqueIdFromFile(hFile_, id, max_size);
575}
576
577//////////////////////////////////////////////////////////////////////////////////
578// WinSequentialFile
579
580WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
1e59de90 581 const FileOptions& options)
7c673cae
FG
582 : WinFileData(fname, f, options.use_direct_reads) {}
583
584WinSequentialFile::~WinSequentialFile() {
585 assert(hFile_ != INVALID_HANDLE_VALUE);
586}
587
1e59de90
TL
588IOStatus WinSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
589 Slice* result, char* scratch,
590 IODebugContext* /*dbg*/) {
591 IOStatus s;
7c673cae
FG
592 size_t r = 0;
593
11fdf7f2
TL
594 assert(result != nullptr);
595 if (WinFileData::use_direct_io()) {
1e59de90 596 return IOStatus::NotSupported("Read() does not support direct_io");
11fdf7f2
TL
597 }
598
7c673cae 599 // Windows ReadFile API accepts a DWORD.
11fdf7f2
TL
600 // While it is possible to read in a loop if n is too big
601 // it is an unlikely case.
602 if (n > std::numeric_limits<DWORD>::max()) {
1e59de90
TL
603 return IOStatus::InvalidArgument("n is too big for a single ReadFile: " +
604 filename_);
7c673cae
FG
605 }
606
1e59de90
TL
607 DWORD bytesToRead =
608 static_cast<DWORD>(n); // cast is safe due to the check above
7c673cae
FG
609 DWORD bytesRead = 0;
610 BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
11fdf7f2 611 if (ret != FALSE) {
7c673cae
FG
612 r = bytesRead;
613 } else {
11fdf7f2
TL
614 auto lastError = GetLastError();
615 if (lastError != ERROR_HANDLE_EOF) {
1e59de90 616 s = IOErrorFromWindowsError("ReadFile failed: " + filename_, lastError);
11fdf7f2 617 }
7c673cae
FG
618 }
619
620 *result = Slice(scratch, r);
7c673cae
FG
621 return s;
622}
623
1e59de90
TL
624IOStatus WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
625 uint64_t offset,
626 size_t& bytes_read) const {
11fdf7f2 627 return pread(this, src, numBytes, offset, bytes_read);
7c673cae
FG
628}
629
1e59de90
TL
630IOStatus WinSequentialFile::PositionedRead(uint64_t offset, size_t n,
631 const IOOptions& /*opts*/,
632 Slice* result, char* scratch,
633 IODebugContext* /*dbg*/) {
11fdf7f2 634 if (!WinFileData::use_direct_io()) {
1e59de90 635 return IOStatus::NotSupported("This function is only used for direct_io");
7c673cae
FG
636 }
637
1e59de90
TL
638 assert(IsSectorAligned(static_cast<size_t>(offset)));
639 assert(IsSectorAligned(static_cast<size_t>(n)));
7c673cae 640
1e59de90
TL
641 size_t bytes_read = 0; // out param
642 IOStatus s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset,
643 bytes_read);
11fdf7f2 644 *result = Slice(scratch, bytes_read);
7c673cae
FG
645 return s;
646}
647
1e59de90
TL
648IOStatus WinSequentialFile::Skip(uint64_t n) {
649 // Can't handle more than signed max as SetFilePointerEx accepts a signed
650 // 64-bit integer. As such it is a highly unlikley case to have n so large.
11fdf7f2 651 if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
1e59de90
TL
652 return IOStatus::InvalidArgument(
653 "n is too large for a single SetFilePointerEx() call" + filename_);
7c673cae
FG
654 }
655
656 LARGE_INTEGER li;
1e59de90
TL
657 li.QuadPart = static_cast<LONGLONG>(n); // cast is safe due to the check
658 // above
7c673cae
FG
659 BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
660 if (ret == FALSE) {
11fdf7f2 661 auto lastError = GetLastError();
20effc67
TL
662 return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
663 lastError);
7c673cae 664 }
1e59de90 665 return IOStatus::OK();
7c673cae
FG
666}
667
1e59de90
TL
668IOStatus WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
669 return IOStatus::OK();
7c673cae
FG
670}
671
672//////////////////////////////////////////////////////////////////////////////////////////////////
673/// WinRandomAccessBase
674
1e59de90
TL
675inline IOStatus WinRandomAccessImpl::PositionedReadInternal(
676 char* src, size_t numBytes, uint64_t offset, size_t& bytes_read) const {
11fdf7f2 677 return pread(file_base_, src, numBytes, offset, bytes_read);
7c673cae
FG
678}
679
1e59de90
TL
680inline WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
681 size_t alignment,
682 const FileOptions& options)
683 : file_base_(file_base),
684 alignment_(std::max(alignment, file_base->GetSectorSize())) {
7c673cae 685 assert(!options.use_mmap_reads);
7c673cae
FG
686}
687
1e59de90
TL
688inline IOStatus WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n,
689 Slice* result,
690 char* scratch) const {
11fdf7f2 691 // Check buffer alignment
7c673cae 692 if (file_base_->use_direct_io()) {
1e59de90
TL
693 assert(file_base_->IsSectorAligned(static_cast<size_t>(offset)));
694 assert(IsAligned(alignment_, scratch));
7c673cae
FG
695 }
696
11fdf7f2
TL
697 if (n == 0) {
698 *result = Slice(scratch, 0);
1e59de90 699 return IOStatus::OK();
7c673cae
FG
700 }
701
11fdf7f2 702 size_t bytes_read = 0;
1e59de90 703 IOStatus s = PositionedReadInternal(scratch, n, offset, bytes_read);
11fdf7f2 704 *result = Slice(scratch, bytes_read);
7c673cae
FG
705 return s;
706}
707
7c673cae
FG
708///////////////////////////////////////////////////////////////////////////////////////////////////
709/// WinRandomAccessFile
710
711WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
712 size_t alignment,
1e59de90 713 const FileOptions& options)
7c673cae
FG
714 : WinFileData(fname, hFile, options.use_direct_reads),
715 WinRandomAccessImpl(this, alignment, options) {}
716
1e59de90 717WinRandomAccessFile::~WinRandomAccessFile() {}
7c673cae 718
1e59de90
TL
719IOStatus WinRandomAccessFile::Read(uint64_t offset, size_t n,
720 const IOOptions& /*options*/, Slice* result,
721 char* scratch,
722 IODebugContext* /*dbg*/) const {
7c673cae
FG
723 return ReadImpl(offset, n, result, scratch);
724}
725
1e59de90
TL
726IOStatus WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
727 return IOStatus::OK();
7c673cae
FG
728}
729
730size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
731 return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
732}
733
734size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
735 return GetAlignment();
736}
737
738/////////////////////////////////////////////////////////////////////////////
739// WinWritableImpl
740//
741
1e59de90
TL
742inline IOStatus WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
743 return fallocate(file_data_->GetName(), file_data_->GetFileHandle(),
744 spaceToReserve);
7c673cae
FG
745}
746
1e59de90
TL
747inline WinWritableImpl::WinWritableImpl(WinFileData* file_data,
748 size_t alignment)
749 : file_data_(file_data),
750 alignment_(std::max(alignment, file_data->GetSectorSize())),
751 next_write_offset_(0),
752 reservedsize_(0) {
11fdf7f2
TL
753 // Query current position in case ReopenWritableFile is called
754 // This position is only important for buffered writes
755 // for unbuffered writes we explicitely specify the position.
756 LARGE_INTEGER zero_move;
1e59de90 757 zero_move.QuadPart = 0; // Do not move
11fdf7f2
TL
758 LARGE_INTEGER pos;
759 pos.QuadPart = 0;
760 BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
1e59de90 761 FILE_CURRENT);
11fdf7f2
TL
762 // Querying no supped to fail
763 if (ret != 0) {
764 next_write_offset_ = pos.QuadPart;
765 } else {
766 assert(false);
767 }
7c673cae
FG
768}
769
1e59de90
TL
770inline IOStatus WinWritableImpl::AppendImpl(const Slice& data) {
771 IOStatus s;
7c673cae 772
11fdf7f2 773 if (data.size() > std::numeric_limits<DWORD>::max()) {
1e59de90
TL
774 return IOStatus::InvalidArgument("data is too long for a single write" +
775 file_data_->GetName());
11fdf7f2 776 }
7c673cae 777
1e59de90 778 size_t bytes_written = 0; // out param
7c673cae
FG
779
780 if (file_data_->use_direct_io()) {
7c673cae
FG
781 // With no offset specified we are appending
782 // to the end of the file
1e59de90
TL
783 assert(file_data_->IsSectorAligned(next_write_offset_));
784 assert(file_data_->IsSectorAligned(data.size()));
785 assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
786 s = pwrite(file_data_, data, next_write_offset_, bytes_written);
7c673cae 787 } else {
7c673cae
FG
788 DWORD bytesWritten = 0;
789 if (!WriteFile(file_data_->GetFileHandle(), data.data(),
1e59de90 790 static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
7c673cae
FG
791 auto lastError = GetLastError();
792 s = IOErrorFromWindowsError(
1e59de90 793 "Failed to WriteFile: " + file_data_->GetName(), lastError);
11fdf7f2
TL
794 } else {
795 bytes_written = bytesWritten;
7c673cae
FG
796 }
797 }
798
1e59de90 799 if (s.ok()) {
11fdf7f2
TL
800 if (bytes_written == data.size()) {
801 // This matters for direct_io cases where
802 // we rely on the fact that next_write_offset_
803 // is sector aligned
804 next_write_offset_ += bytes_written;
805 } else {
1e59de90
TL
806 s = IOStatus::IOError("Failed to write all bytes: " +
807 file_data_->GetName());
11fdf7f2 808 }
7c673cae
FG
809 }
810
811 return s;
812}
813
1e59de90
TL
814inline IOStatus WinWritableImpl::PositionedAppendImpl(const Slice& data,
815 uint64_t offset) {
816 if (file_data_->use_direct_io()) {
817 assert(file_data_->IsSectorAligned(static_cast<size_t>(offset)));
818 assert(file_data_->IsSectorAligned(data.size()));
819 assert(IsAligned(static_cast<size_t>(GetAlignment()), data.data()));
7c673cae
FG
820 }
821
11fdf7f2 822 size_t bytes_written = 0;
1e59de90 823 IOStatus s = pwrite(file_data_, data, offset, bytes_written);
7c673cae 824
1e59de90 825 if (s.ok()) {
11fdf7f2
TL
826 if (bytes_written == data.size()) {
827 // For sequential write this would be simple
828 // size extension by data.size()
829 uint64_t write_end = offset + bytes_written;
830 if (write_end >= next_write_offset_) {
831 next_write_offset_ = write_end;
832 }
833 } else {
1e59de90
TL
834 s = IOStatus::IOError("Failed to write all of the requested data: " +
835 file_data_->GetName());
7c673cae
FG
836 }
837 }
838 return s;
839}
840
1e59de90 841inline IOStatus WinWritableImpl::TruncateImpl(uint64_t size) {
11fdf7f2
TL
842 // It is tempting to check for the size for sector alignment
843 // but truncation may come at the end and there is not a requirement
844 // for this to be sector aligned so long as we do not attempt to write
845 // after that. The interface docs state that the behavior is undefined
846 // in that case.
1e59de90
TL
847 IOStatus s =
848 ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size);
11fdf7f2 849
7c673cae 850 if (s.ok()) {
11fdf7f2 851 next_write_offset_ = size;
7c673cae
FG
852 }
853 return s;
854}
855
1e59de90
TL
856inline IOStatus WinWritableImpl::CloseImpl() {
857 IOStatus s;
7c673cae
FG
858
859 auto hFile = file_data_->GetFileHandle();
860 assert(INVALID_HANDLE_VALUE != hFile);
861
11fdf7f2 862 if (!::FlushFileBuffers(hFile)) {
7c673cae 863 auto lastError = GetLastError();
1e59de90
TL
864 s = IOErrorFromWindowsError(
865 "FlushFileBuffers failed at Close() for: " + file_data_->GetName(),
866 lastError);
7c673cae
FG
867 }
868
1e59de90 869 if (!file_data_->CloseFile() && s.ok()) {
7c673cae 870 auto lastError = GetLastError();
1e59de90
TL
871 s = IOErrorFromWindowsError(
872 "CloseHandle failed for: " + file_data_->GetName(), lastError);
7c673cae
FG
873 }
874 return s;
875}
876
1e59de90
TL
877inline IOStatus WinWritableImpl::SyncImpl(const IOOptions& /*options*/,
878 IODebugContext* /*dbg*/) {
879 IOStatus s;
880 if (!::FlushFileBuffers(file_data_->GetFileHandle())) {
7c673cae
FG
881 auto lastError = GetLastError();
882 s = IOErrorFromWindowsError(
1e59de90
TL
883 "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(),
884 lastError);
7c673cae
FG
885 }
886 return s;
887}
888
1e59de90
TL
889inline IOStatus WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
890 IOStatus status;
891 TEST_KILL_RANDOM("WinWritableFile::Allocate");
7c673cae
FG
892
893 // Make sure that we reserve an aligned amount of space
894 // since the reservation block size is driven outside so we want
895 // to check if we are ok with reservation here
1e59de90
TL
896 size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len),
897 static_cast<size_t>(alignment_));
7c673cae
FG
898 // Nothing to do
899 if (spaceToReserve <= reservedsize_) {
900 return status;
901 }
902
903 IOSTATS_TIMER_GUARD(allocate_nanos);
904 status = PreallocateInternal(spaceToReserve);
905 if (status.ok()) {
906 reservedsize_ = spaceToReserve;
907 }
908 return status;
909}
910
7c673cae
FG
911////////////////////////////////////////////////////////////////////////////////
912/// WinWritableFile
913
914WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
915 size_t alignment, size_t /* capacity */,
1e59de90 916 const FileOptions& options)
7c673cae 917 : WinFileData(fname, hFile, options.use_direct_writes),
f67539c2 918 WinWritableImpl(this, alignment),
1e59de90 919 FSWritableFile(options) {
7c673cae
FG
920 assert(!options.use_mmap_writes);
921}
922
1e59de90 923WinWritableFile::~WinWritableFile() {}
7c673cae
FG
924
925// Indicates if the class makes use of direct I/O
1e59de90
TL
926bool WinWritableFile::use_direct_io() const {
927 return WinFileData::use_direct_io();
928}
7c673cae
FG
929
930size_t WinWritableFile::GetRequiredBufferAlignment() const {
1e59de90 931 return static_cast<size_t>(GetAlignment());
7c673cae
FG
932}
933
1e59de90
TL
934IOStatus WinWritableFile::Append(const Slice& data,
935 const IOOptions& /*options*/,
936 IODebugContext* /*dbg*/) {
7c673cae
FG
937 return AppendImpl(data);
938}
939
1e59de90
TL
940IOStatus WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
941 const IOOptions& /*options*/,
942 IODebugContext* /*dbg*/) {
7c673cae
FG
943 return PositionedAppendImpl(data, offset);
944}
945
946// Need to implement this so the file is truncated correctly
947// when buffered and unbuffered mode
1e59de90
TL
948IOStatus WinWritableFile::Truncate(uint64_t size, const IOOptions& /*options*/,
949 IODebugContext* /*dbg*/) {
7c673cae
FG
950 return TruncateImpl(size);
951}
952
1e59de90
TL
953IOStatus WinWritableFile::Close(const IOOptions& /*options*/,
954 IODebugContext* /*dbg*/) {
7c673cae
FG
955 return CloseImpl();
956}
957
1e59de90
TL
958// write out the cached data to the OS cache
959// This is now taken care of the WritableFileWriter
960IOStatus WinWritableFile::Flush(const IOOptions& /*options*/,
961 IODebugContext* /*dbg*/) {
962 return IOStatus::OK();
7c673cae
FG
963}
964
1e59de90
TL
965IOStatus WinWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) {
966 return SyncImpl(options, dbg);
7c673cae
FG
967}
968
1e59de90
TL
969IOStatus WinWritableFile::Fsync(const IOOptions& options, IODebugContext* dbg) {
970 return SyncImpl(options, dbg);
971}
7c673cae 972
11fdf7f2
TL
973bool WinWritableFile::IsSyncThreadSafe() const { return true; }
974
1e59de90
TL
975uint64_t WinWritableFile::GetFileSize(const IOOptions& /*options*/,
976 IODebugContext* /*dbg*/) {
11fdf7f2 977 return GetFileNextWriteOffset();
7c673cae
FG
978}
979
1e59de90
TL
980IOStatus WinWritableFile::Allocate(uint64_t offset, uint64_t len,
981 const IOOptions& /*options*/,
982 IODebugContext* /*dbg*/) {
7c673cae
FG
983 return AllocateImpl(offset, len);
984}
985
986size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
987 return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
988}
989
990/////////////////////////////////////////////////////////////////////////
991/// WinRandomRWFile
992
993WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
1e59de90 994 size_t alignment, const FileOptions& options)
7c673cae
FG
995 : WinFileData(fname, hFile,
996 options.use_direct_reads && options.use_direct_writes),
997 WinRandomAccessImpl(this, alignment, options),
998 WinWritableImpl(this, alignment) {}
999
1e59de90
TL
1000bool WinRandomRWFile::use_direct_io() const {
1001 return WinFileData::use_direct_io();
1002}
7c673cae
FG
1003
1004size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
1e59de90
TL
1005 assert(WinRandomAccessImpl::GetAlignment() ==
1006 WinWritableImpl::GetAlignment());
1007 return static_cast<size_t>(WinRandomAccessImpl::GetAlignment());
7c673cae
FG
1008}
1009
1e59de90
TL
1010IOStatus WinRandomRWFile::Write(uint64_t offset, const Slice& data,
1011 const IOOptions& /*options*/,
1012 IODebugContext* /*dbg*/) {
7c673cae
FG
1013 return PositionedAppendImpl(data, offset);
1014}
1015
1e59de90
TL
1016IOStatus WinRandomRWFile::Read(uint64_t offset, size_t n,
1017 const IOOptions& /*options*/, Slice* result,
1018 char* scratch, IODebugContext* /*dbg*/) const {
7c673cae
FG
1019 return ReadImpl(offset, n, result, scratch);
1020}
1021
1e59de90
TL
1022IOStatus WinRandomRWFile::Flush(const IOOptions& /*options*/,
1023 IODebugContext* /*dbg*/) {
1024 return IOStatus::OK();
7c673cae
FG
1025}
1026
1e59de90
TL
1027IOStatus WinRandomRWFile::Sync(const IOOptions& options, IODebugContext* dbg) {
1028 return SyncImpl(options, dbg);
7c673cae
FG
1029}
1030
1e59de90
TL
1031IOStatus WinRandomRWFile::Close(const IOOptions& /*options*/,
1032 IODebugContext* /*dbg*/) {
7c673cae
FG
1033 return CloseImpl();
1034}
1035
11fdf7f2
TL
1036//////////////////////////////////////////////////////////////////////////
1037/// WinMemoryMappedBuffer
1038WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
f67539c2
TL
1039 BOOL ret
1040#if defined(_MSC_VER)
1e59de90 1041 = FALSE;
f67539c2 1042#else
1e59de90 1043 __attribute__((__unused__));
f67539c2 1044#endif
11fdf7f2
TL
1045 if (base_ != nullptr) {
1046 ret = ::UnmapViewOfFile(base_);
1047 assert(ret);
1048 base_ = nullptr;
1049 }
1050 if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
1051 ret = ::CloseHandle(map_handle_);
1052 assert(ret);
1053 map_handle_ = NULL;
1054 }
1055 if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
1056 ret = ::CloseHandle(file_handle_);
1057 assert(ret);
1058 file_handle_ = NULL;
1059 }
1060}
1061
7c673cae
FG
1062//////////////////////////////////////////////////////////////////////////
1063/// WinDirectory
1064
1e59de90
TL
1065IOStatus WinDirectory::Fsync(const IOOptions& /*options*/,
1066 IODebugContext* /*dbg*/) {
1067 return IOStatus::OK();
1068}
1069
1070IOStatus WinDirectory::Close(const IOOptions& /*options*/,
1071 IODebugContext* /*dbg*/) {
1072 IOStatus s = IOStatus::OK();
1073 BOOL ret __attribute__((__unused__));
1074 if (handle_ != INVALID_HANDLE_VALUE) {
1075 ret = ::CloseHandle(handle_);
1076 if (!ret) {
1077 auto lastError = GetLastError();
1078 s = IOErrorFromWindowsError("Directory closes failed for : " + GetName(),
1079 lastError);
1080 }
1081 handle_ = NULL;
1082 }
1083 return s;
1084}
7c673cae 1085
11fdf7f2
TL
1086size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
1087 return GetUniqueIdFromFile(handle_, id, max_size);
1088}
7c673cae
FG
1089//////////////////////////////////////////////////////////////////////////
1090/// WinFileLock
1091
1092WinFileLock::~WinFileLock() {
11fdf7f2
TL
1093 BOOL ret __attribute__((__unused__));
1094 ret = ::CloseHandle(hFile_);
7c673cae
FG
1095 assert(ret);
1096}
1097
1e59de90 1098} // namespace port
f67539c2 1099} // namespace ROCKSDB_NAMESPACE
20effc67
TL
1100
1101#endif