diff --git a/CMakeLists.txt b/CMakeLists.txt index d1ac000a09d..ef896f7af66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1141,12 +1141,12 @@ if (BUILD_TESTS OR BUILD_BENCHMARKS) if (${LIBAIO_FOUND}) folly_define_tests( - DIRECTORY experimental/io/test/ + DIRECTORY io/async/test/ TEST async_io_test SOURCES AsyncIOTest.cpp - AsyncBaseTestLib.cpp - IoTestTempFileUtil.cpp + ../../../experimental/io/test/AsyncBaseTestLib.cpp + ../../../experimental/io/test/IoTestTempFileUtil.cpp ) endif() diff --git a/folly/experimental/io/AsyncBase.h b/folly/experimental/io/AsyncBase.h index f72fd139e66..8398c5d565c 100644 --- a/folly/experimental/io/AsyncBase.h +++ b/folly/experimental/io/AsyncBase.h @@ -14,305 +14,4 @@ * limitations under the License. */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace folly { -class AsyncIOOp; -class IoUringOp; -/** - * An AsyncBaseOp represents a pending operation. You may set a notification - * callback or you may use this class's methods directly. - * - * The op must remain allocated until it is completed or canceled. - */ -class AsyncBaseOp { - friend class AsyncBase; - - public: - using NotificationCallback = folly::Function; - - explicit AsyncBaseOp(NotificationCallback cb = NotificationCallback()); - AsyncBaseOp(const AsyncBaseOp&) = delete; - AsyncBaseOp& operator=(const AsyncBaseOp&) = delete; - virtual ~AsyncBaseOp(); - - enum class State { - UNINITIALIZED, - INITIALIZED, - PENDING, - COMPLETED, - CANCELED, - }; - - /** - * Initiate a read request. 
- */ - virtual void pread(int fd, void* buf, size_t size, off_t start) = 0; - void pread(int fd, Range range, off_t start) { - pread(fd, range.begin(), range.size(), start); - } - virtual void preadv(int fd, const iovec* iov, int iovcnt, off_t start) = 0; - virtual void pread( - int fd, void* buf, size_t size, off_t start, int /*buf_index*/) { - pread(fd, buf, size, start); - } - - /** - * Initiate a write request. - */ - virtual void pwrite(int fd, const void* buf, size_t size, off_t start) = 0; - void pwrite(int fd, Range range, off_t start) { - pwrite(fd, range.begin(), range.size(), start); - } - virtual void pwritev(int fd, const iovec* iov, int iovcnt, off_t start) = 0; - virtual void pwrite( - int fd, const void* buf, size_t size, off_t start, int /*buf_index*/) { - pwrite(fd, buf, size, start); - } - - // we support only these subclasses - virtual AsyncIOOp* getAsyncIOOp() = 0; - virtual IoUringOp* getIoUringOp() = 0; - - // ostream output - virtual void toStream(std::ostream& os) const = 0; - - /** - * Return the current operation state. - */ - State state() const { return state_; } - - /** - * user data get/set - */ - void* getUserData() const { return userData_; } - - void setUserData(void* userData) { userData_ = userData; } - - /** - * Reset the operation for reuse. It is an error to call reset() on - * an Op that is still pending. - */ - virtual void reset(NotificationCallback cb = NotificationCallback()) = 0; - - void setNotificationCallback(NotificationCallback cb) { cb_ = std::move(cb); } - - /** - * Get the notification callback from the op. - * - * Note that this moves the callback out, leaving the callback in an - * uninitialized state! You must call setNotificationCallback before - * submitting the operation! - */ - NotificationCallback getNotificationCallback() { return std::move(cb_); } - - /** - * Retrieve the result of this operation. 
Returns >=0 on success, - * -errno on failure (that is, using the Linux kernel error reporting - * conventions). Use checkKernelError (folly/Exception.h) on the result to - * throw a std::system_error in case of error instead. - * - * It is an error to call this if the Op hasn't completed. - */ - ssize_t result() const; - - // debug helper - static std::string fd2name(int fd); - - protected: - void init(); - void start(); - void unstart(); - void complete(ssize_t result); - void cancel(); - - NotificationCallback cb_; - std::atomic state_; - ssize_t result_; - void* userData_{nullptr}; -}; - -std::ostream& operator<<(std::ostream& os, const AsyncBaseOp& op); -std::ostream& operator<<(std::ostream& os, AsyncBaseOp::State state); - -/** - * Generic C++ interface around Linux IO(io_submit, io_uring) - */ -class AsyncBase { - public: - using Op = AsyncBaseOp; - - enum PollMode { - NOT_POLLABLE, - POLLABLE, - }; - - /** - * Create an AsyncBase context capable of holding at most 'capacity' pending - * requests at the same time. As requests complete, others can be scheduled, - * as long as this limit is not exceeded. - * - * If pollMode is POLLABLE, pollFd() will return a file descriptor that - * can be passed to poll / epoll / select and will become readable when - * any IOs on this AsyncBase have completed. If you do this, you must use - * pollCompleted() instead of wait() -- do not read from the pollFd() - * file descriptor directly. - * - * You may use the same AsyncBase object from multiple threads, as long as - * there is only one concurrent caller of wait() / pollCompleted() / cancel() - * (perhaps by always calling it from the same thread, or by providing - * appropriate mutual exclusion). In this case, pending() returns a snapshot - * of the current number of pending requests. 
- */ - explicit AsyncBase(size_t capacity, PollMode pollMode = NOT_POLLABLE); - AsyncBase(const AsyncBase&) = delete; - AsyncBase& operator=(const AsyncBase&) = delete; - virtual ~AsyncBase(); - - /** - * Initialize context - */ - virtual void initializeContext() = 0; - - /** - * Wait for at least minRequests to complete. Returns the requests that - * have completed; the returned range is valid until the next call to - * wait(). minRequests may be 0 to not block. - */ - Range wait(size_t minRequests); - - /** - * Cancel all pending requests and return them; the returned range is - * valid until the next call to cancel(). - */ - Range cancel(); - - /** - * Return the number of pending requests. - */ - size_t pending() const { return pending_; } - - /** - * Return the maximum number of requests that can be kept outstanding - * at any one time. - */ - size_t capacity() const { return capacity_; } - - /** - * Return the accumulative number of submitted I/O, since this object - * has been created. - */ - size_t totalSubmits() const { return submitted_; } - - /** - * If POLLABLE, return a file descriptor that can be passed to poll / epoll - * and will become readable when any async IO operations have completed. - * If NOT_POLLABLE, return -1. - */ - int pollFd() const { return pollFd_; } - - /** - * If POLLABLE, call instead of wait after the file descriptor returned - * by pollFd() became readable. The returned range is valid until the next - * call to pollCompleted(). - */ - Range pollCompleted(); - - /** - * Submit an op for execution. 
- */ - void submit(Op* op); - - /** - * Submit a range of ops for execution - */ - int submit(Range ops); - - protected: - virtual int drainPollFd() = 0; - void complete(Op* op, ssize_t result) { op->complete(result); } - - void cancel(Op* op) { op->cancel(); } - - bool isInit() const { return init_.load(std::memory_order_relaxed); } - - void decrementPending(size_t num = 1); - virtual int submitOne(AsyncBase::Op* op) = 0; - virtual int submitRange(Range ops) = 0; - - enum class WaitType { COMPLETE, CANCEL }; - virtual Range doWait( - WaitType type, - size_t minRequests, - size_t maxRequests, - std::vector& result) = 0; - - std::atomic init_{false}; - std::mutex initMutex_; - - std::atomic pending_{0}; - std::atomic submitted_{0}; - const size_t capacity_; - const PollMode pollMode_; - int pollFd_{-1}; - std::vector completed_; - std::vector canceled_; -}; - -/** - * Wrapper around AsyncBase that allows you to schedule more requests than - * the AsyncBase's object capacity. Other requests are queued and processed - * in a FIFO order. - */ -class AsyncBaseQueue { - public: - /** - * Create a queue, using the given AsyncBase object. - * The AsyncBase object may not be used by anything else until the - * queue is destroyed. - */ - explicit AsyncBaseQueue(AsyncBase* asyncBase); - ~AsyncBaseQueue(); - - size_t queued() const { return queue_.size(); } - - /** - * Submit an op to the AsyncBase queue. The op will be queued until - * the AsyncBase object has room. - */ - void submit(AsyncBaseOp* op); - - /** - * Submit a delayed op to the AsyncBase queue; this allows you to postpone - * creation of the Op (which may require allocating memory, etc) until - * the AsyncBase object has room. 
- */ - using OpFactory = std::function; - void submit(OpFactory op); - - private: - void onCompleted(AsyncBaseOp* op); - void maybeDequeue(); - - AsyncBase* asyncBase_; - - std::deque queue_; -}; - -} // namespace folly +#include diff --git a/folly/experimental/io/AsyncIO.h b/folly/experimental/io/AsyncIO.h index b1a6f52eeed..0e77376646a 100644 --- a/folly/experimental/io/AsyncIO.h +++ b/folly/experimental/io/AsyncIO.h @@ -14,88 +14,4 @@ * limitations under the License. */ -#pragma once - -#include - -#if __has_include() - -#include - -namespace folly { - -class AsyncIOOp : public AsyncBaseOp { - friend class AsyncIO; - friend std::ostream& operator<<(std::ostream& os, const AsyncIOOp& o); - - public: - explicit AsyncIOOp(NotificationCallback cb = NotificationCallback()); - AsyncIOOp(const AsyncIOOp&) = delete; - AsyncIOOp& operator=(const AsyncIOOp&) = delete; - ~AsyncIOOp() override; - - /** - * Initiate a read request. - */ - void pread(int fd, void* buf, size_t size, off_t start) override; - void preadv(int fd, const iovec* iov, int iovcnt, off_t start) override; - - /** - * Initiate a write request. - */ - void pwrite(int fd, const void* buf, size_t size, off_t start) override; - void pwritev(int fd, const iovec* iov, int iovcnt, off_t start) override; - - void reset(NotificationCallback cb = NotificationCallback()) override; - - AsyncIOOp* getAsyncIOOp() override { return this; } - - IoUringOp* getIoUringOp() override { return nullptr; } - - void toStream(std::ostream& os) const override; - - const iocb& getIocb() const { return iocb_; } - - private: - iocb iocb_; -}; - -std::ostream& operator<<(std::ostream& os, const AsyncIOOp& op); - -/** - * C++ interface around Linux Async IO. - */ -class AsyncIO : public AsyncBase { - public: - using Op = AsyncIOOp; - - /** - * Note: the maximum number of allowed concurrent requests is controlled - * by the fs.aio-max-nr sysctl, the default value is usually 64K. 
- */ - explicit AsyncIO(size_t capacity, PollMode pollMode = NOT_POLLABLE); - AsyncIO(const AsyncIO&) = delete; - AsyncIO& operator=(const AsyncIO&) = delete; - ~AsyncIO() override; - - void initializeContext() override; - - protected: - int drainPollFd() override; - int submitOne(AsyncBase::Op* op) override; - int submitRange(Range ops) override; - - private: - Range doWait( - WaitType type, - size_t minRequests, - size_t maxRequests, - std::vector& result) override; - - io_context_t ctx_{nullptr}; -}; - -using AsyncIOQueue = AsyncBaseQueue; -} // namespace folly - -#endif +#include diff --git a/folly/experimental/io/AsyncIoUringSocket.h b/folly/experimental/io/AsyncIoUringSocket.h index 63ad65d324f..9381210ff76 100644 --- a/folly/experimental/io/AsyncIoUringSocket.h +++ b/folly/experimental/io/AsyncIoUringSocket.h @@ -14,504 +14,4 @@ * limitations under the License. */ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace folly { -class AsyncDetachFdCallback { - public: - virtual ~AsyncDetachFdCallback() = default; - virtual void fdDetached( - NetworkSocket ns, std::unique_ptr unread) noexcept = 0; - virtual void fdDetachFail(const AsyncSocketException& ex) noexcept = 0; -}; -} // namespace folly - -#if FOLLY_HAS_LIBURING -class IoUringBackend; - -namespace folly { - -class AsyncIoUringSocket : public AsyncSocketTransport { - public: - using Cert = folly::AsyncTransportCertificate; - struct Options { - Options() - : allocateNoBufferPoolBuffer(defaultAllocateNoBufferPoolBuffer), - multishotRecv(true) {} - - static std::unique_ptr defaultAllocateNoBufferPoolBuffer(); - folly::Function()> allocateNoBufferPoolBuffer; - folly::Optional zeroCopyEnable; - bool multishotRecv; - }; - - using UniquePtr = std::unique_ptr; - explicit AsyncIoUringSocket( - AsyncTransport::UniquePtr 
other, Options&& options = Options{}); - explicit AsyncIoUringSocket(AsyncSocket* sock, Options&& options = Options{}); - explicit AsyncIoUringSocket(EventBase* evb, Options&& options = Options{}); - explicit AsyncIoUringSocket( - EventBase* evb, NetworkSocket ns, Options&& options = Options{}); - - static bool supports(EventBase* backend); - - void connect( - AsyncSocket::ConnectCallback* callback, - const folly::SocketAddress& address, - std::chrono::milliseconds timeout = std::chrono::milliseconds(0), - SocketOptionMap const& options = emptySocketOptionMap, - const SocketAddress& bindAddr = anyAddress(), - const std::string& ifName = std::string()) noexcept; - - void connect( - ConnectCallback* callback, - const folly::SocketAddress& address, - int timeout, - SocketOptionMap const& options, - const SocketAddress& bindAddr, - const std::string& ifName) noexcept override { - connect( - callback, - address, - std::chrono::milliseconds(timeout), - options, - bindAddr, - ifName); - } - - std::chrono::nanoseconds getConnectTime() const { - return connectEndTime_ - connectStartTime_; - } - - // AsyncSocketBase - EventBase* getEventBase() const override { return evb_; } - - // AsyncReader - void setReadCB(ReadCallback* callback) override; - - ReadCallback* getReadCallback() const override { - return readSqe_->readCallback(); - } - std::unique_ptr takePreReceivedData() override { - return readSqe_->takePreReceivedData(); - } - - // AsyncWriter - void write(WriteCallback*, const void*, size_t, WriteFlags = WriteFlags::NONE) - override; - void writev( - WriteCallback*, - const iovec*, - size_t, - WriteFlags = WriteFlags::NONE) override; - void writeChain( - WriteCallback* callback, - std::unique_ptr&& buf, - WriteFlags flags) override; - bool canZC(std::unique_ptr const& buf) const; - - // AsyncTransport - void close() override; - void closeNow() override; - void closeWithReset() override; - void shutdownWrite() override; - void shutdownWriteNow() override; - - bool good() 
const override; - bool readable() const override { return good(); } - bool error() const override; - bool hangup() const override; - - bool connecting() const override { - return connectSqe_ && connectSqe_->inFlight(); - } - - void attachEventBase(EventBase*) override; - void detachEventBase() override; - bool isDetachable() const override; - - uint32_t getSendTimeout() const override { - return static_cast( - std::chrono::duration_cast(writeTimeoutTime_) - .count()); - } - - void setSendTimeout(uint32_t ms) override; - - void getLocalAddress(SocketAddress* address) const override; - - void getPeerAddress(SocketAddress*) const override; - - void setPreReceivedData(std::unique_ptr data) override; - void cacheAddresses() override; - - /** - * @return True iff end of record tracking is enabled - */ - bool isEorTrackingEnabled() const override { return false; } - - void setEorTracking(bool) override { - // don't support this. - // as far as I can see this is only used by AsyncSSLSocket, but TLS1.3 - // supercedes this so I think we can ignore it. 
- throw std::runtime_error( - "AsyncIoUringSocket::setEorTracking not supported"); - } - - size_t getAppBytesWritten() const override { return getRawBytesWritten(); } - size_t getRawBytesWritten() const override { return bytesWritten_; } - size_t getAppBytesReceived() const override { return getRawBytesReceived(); } - size_t getRawBytesReceived() const override; - - const AsyncTransport* getWrappedTransport() const override { return nullptr; } - - // AsyncSocketTransport - int setNoDelay(bool noDelay) override; - int setSockOpt( - int level, int optname, const void* optval, socklen_t optsize) override; - - std::string getSecurityProtocol() const override { return securityProtocol_; } - std::string getApplicationProtocol() const noexcept override { - return applicationProtocol_; - } - NetworkSocket getNetworkSocket() const override { return fd_; } - - void setSecurityProtocol(std::string s) { securityProtocol_ = std::move(s); } - void setApplicationProtocol(std::string s) { - applicationProtocol_ = std::move(s); - } - - const folly::AsyncTransportCertificate* getPeerCertificate() const override { - return peerCert_.get(); - } - - const folly::AsyncTransportCertificate* getSelfCertificate() const override { - return selfCert_.get(); - } - - void dropPeerCertificate() noexcept override { peerCert_.reset(); } - - void dropSelfCertificate() noexcept override { selfCert_.reset(); } - - void setPeerCertificate(const std::shared_ptr& peerCert) { - peerCert_ = peerCert; - } - - void setSelfCertificate(const std::shared_ptr& selfCert) { - selfCert_ = selfCert; - } - - void asyncDetachFd(AsyncDetachFdCallback* callback); - bool readSqeInFlight() const { return readSqe_->inFlight(); } - bool getTFOSucceded() const override; - void enableTFO() override { - // No-op if folly does not allow tfo -#if FOLLY_ALLOW_TFO - VLOG(5) << "AsyncIoUringSocket::enableTFO()"; - enableTFO_ = true; -#endif - } - - void appendPreReceive(std::unique_ptr iobuf) noexcept; - - protected: - 
~AsyncIoUringSocket() override; - - private: - friend class ReadSqe; - friend class WriteSqe; - void setFd(NetworkSocket ns); - void registerFd(); - void unregisterFd(); - void readProcessSubmit( - struct io_uring_sqe* sqe, - IoUringBufferProviderBase* bufferProvider, - size_t* maxSize, - IoUringBufferProviderBase* usedBufferProvider) noexcept; - void readCallback( - int res, - uint32_t flags, - size_t maxSize, - IoUringBufferProviderBase* bufferProvider) noexcept; - void allowReads(); - void previousReadDone(); - void processWriteQueue() noexcept; - void setStateEstablished(); - void writeDone() noexcept; - void doSubmitWrite() noexcept; - void doReSubmitWrite() noexcept; - void failAllWrites() noexcept; - void submitRead(bool now = false); - void processConnectSubmit( - struct io_uring_sqe* sqe, sockaddr_storage& storage); - void processConnectResult(const io_uring_cqe* cqe); - void processConnectTimeout(); - void processFastOpenResult(const io_uring_cqe* cqe) noexcept; - void startSendTimeout(); - void sendTimeoutExpired(); - void failWrite(const AsyncSocketException& ex); - void readEOF(); - void readError(); - NetworkSocket takeFd(); - bool setZeroCopy(bool enable) override; - bool getZeroCopy() const override; - void setZeroCopyEnableFunc(AsyncWriter::ZeroCopyEnableFunc func) override; - - enum class State { - None, - Connecting, - Established, - Closed, - Error, - FastOpen, - }; - - static std::string toString(State s); - std::string stateAsString() const { return toString(state_); } - - struct ReadSqe : IoSqeBase, DelayedDestruction { - using UniquePtr = std::unique_ptr; - explicit ReadSqe(AsyncIoUringSocket* parent); - void processSubmit(struct io_uring_sqe* sqe) noexcept override; - void callback(const io_uring_cqe* cqe) noexcept override; - void callbackCancelled(const io_uring_cqe* cqe) noexcept override; - - void setReadCallback(ReadCallback* callback, bool submitNow); - ReadCallback* readCallback() const { return readCallback_; } - - size_t 
bytesReceived() const { return bytesReceived_; } - - std::unique_ptr takePreReceivedData(); - void appendPreReceive(std::unique_ptr data) noexcept { - appendReadData(std::move(data), preReceivedData_); - } - - void destroy() override { - parent_ = nullptr; - DelayedDestruction::destroy(); - } - - bool waitingForOldEventBaseRead() const; - void setOldEventBaseRead(folly::SemiFuture>&& f) { - oldEventBaseRead_ = std::move(f); - } - void attachEventBase(); - folly::Optional>> - detachEventBase(); - - private: - ~ReadSqe() override = default; - void appendReadData( - std::unique_ptr data, std::unique_ptr& overflow) noexcept; - void sendReadBuf( - std::unique_ptr buf, std::unique_ptr& overflow) noexcept; - bool readCallbackUseIoBufs() const; - void invalidState(ReadCallback* callback); - void processOldEventBaseRead(); - - IoUringBufferProviderBase* lastUsedBufferProvider_; - ReadCallback* readCallback_ = nullptr; - AsyncIoUringSocket* parent_; - size_t maxSize_; - uint64_t setReadCbCount_{0}; - size_t bytesReceived_{0}; - - std::unique_ptr queuedReceivedData_; - std::unique_ptr preReceivedData_; - std::unique_ptr tmpBuffer_; - bool supportsMultishotRecv_ = - false; // todo: this can be per process instead of per socket - - folly::Optional>> - oldEventBaseRead_; - std::shared_ptr alive_; - }; - - struct CloseSqe : IoSqeBase { - explicit CloseSqe(AsyncIoUringSocket* parent) - : IoSqeBase(IoSqeBase::Type::Close), parent_(parent) {} - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - parent_->closeProcessSubmit(sqe); - } - void callback(const io_uring_cqe*) noexcept override { delete this; } - void callbackCancelled(const io_uring_cqe*) noexcept override { - delete this; - } - AsyncIoUringSocket* parent_; - }; - - struct write_sqe_tag; - using write_sqe_hook = - boost::intrusive::list_base_hook>; - struct WriteSqe final : IoSqeBase, public write_sqe_hook { - explicit WriteSqe( - AsyncIoUringSocket* parent, - WriteCallback* callback, - std::unique_ptr&& 
buf, - WriteFlags flags, - bool zc); - ~WriteSqe() override { VLOG(5) << "~WriteSqe() " << this; } - - void processSubmit(struct io_uring_sqe* sqe) noexcept override; - void callback(const io_uring_cqe* cqe) noexcept override; - void callbackCancelled(const io_uring_cqe* cqe) noexcept override; - int sendMsgFlags() const; - std::pair< - folly::SemiFuture>>, - WriteSqe*> - detachEventBase(); - - boost::intrusive::list_member_hook<> member_hook_; - AsyncIoUringSocket* parent_; - WriteCallback* callback_; - std::unique_ptr buf_; - WriteFlags flags_; - static constexpr size_t kSmallIoVecSize = 16; - small_vector iov_; - size_t totalLength_; - struct msghdr msg_; - - bool zerocopy_{false}; - int refs_ = 1; - folly::Function detachedSignal_; - }; - using WriteSqeList = boost::intrusive::list< - WriteSqe, - boost::intrusive::base_hook, - boost::intrusive::constant_time_size>; - - class WriteTimeout : public AsyncTimeout { - public: - explicit WriteTimeout(AsyncIoUringSocket* socket) - : AsyncTimeout(socket->evb_), socket_(socket) {} - - void timeoutExpired() noexcept override { socket_->sendTimeoutExpired(); } - - private: - AsyncIoUringSocket* socket_; - }; - - struct ConnectSqe : IoSqeBase, AsyncTimeout { - explicit ConnectSqe(AsyncIoUringSocket* parent) - : IoSqeBase(IoSqeBase::Type::Connect), - AsyncTimeout(parent->evb_), - parent_(parent) {} - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - parent_->processConnectSubmit(sqe, addrStorage); - } - void callback(const io_uring_cqe* cqe) noexcept override { - parent_->processConnectResult(cqe); - } - void callbackCancelled(const io_uring_cqe*) noexcept override { - delete this; - } - void timeoutExpired() noexcept override { - if (!cancelled()) { - parent_->processConnectTimeout(); - } - } - AsyncIoUringSocket* parent_; - sockaddr_storage addrStorage; - }; - - struct FastOpenSqe : IoSqeBase { - explicit FastOpenSqe( - AsyncIoUringSocket* parent, - SocketAddress const& addr, - std::unique_ptr 
initialWrite); - void processSubmit(struct io_uring_sqe* sqe) noexcept override; - void cleanupMsg() noexcept; - void callback(const io_uring_cqe* cqe) noexcept override { - cleanupMsg(); - parent_->processFastOpenResult(cqe); - } - void callbackCancelled(const io_uring_cqe*) noexcept override { - delete this; - } - - AsyncIoUringSocket* parent_; - std::unique_ptr initialWrite; - size_t addrLen_; - sockaddr_storage addrStorage; - }; - - EventBase* evb_ = nullptr; - NetworkSocket fd_; - IoUringBackend* backend_ = nullptr; - Options options_; - mutable SocketAddress localAddress_; - mutable SocketAddress peerAddress_; - IoUringFdRegistrationRecord* fdRegistered_ = nullptr; - int usedFd_ = -1; - unsigned int mbFixedFileFlags_ = 0; - std::unique_ptr closeSqe_{new CloseSqe(this)}; - - State state_ = State::None; - - // read - friend struct DetachFdState; - ReadSqe::UniquePtr readSqe_; - - // write - std::chrono::milliseconds writeTimeoutTime_{0}; - WriteTimeout writeTimeout_{this}; - WriteSqe* writeSqeActive_ = nullptr; - WriteSqeList writeSqeQueue_; - size_t bytesWritten_{0}; - - // connect - std::unique_ptr connectSqe_; - AsyncSocket::ConnectCallback* connectCallback_; - std::chrono::milliseconds connectTimeout_{0}; - std::chrono::steady_clock::time_point connectStartTime_; - std::chrono::steady_clock::time_point connectEndTime_; - - // stopTLS helpers: - std::string securityProtocol_; - std::string applicationProtocol_; - - std::shared_ptr selfCert_; - std::shared_ptr peerCert_; - - // shutdown: - int shutdownFlags_ = 0; - - // TCP fast open - std::unique_ptr fastOpenSqe_; - bool enableTFO_ = false; - - // detach event base - bool isDetaching_ = false; - Optional>>> - detachedWriteResult_; - std::shared_ptr alive_; - - void closeProcessSubmit(struct io_uring_sqe* sqe); -}; -} // namespace folly - -#endif +#include diff --git a/folly/experimental/io/AsyncIoUringSocketFactory.h b/folly/experimental/io/AsyncIoUringSocketFactory.h index de40cad8218..e6748b2f2c5 100644 
--- a/folly/experimental/io/AsyncIoUringSocketFactory.h +++ b/folly/experimental/io/AsyncIoUringSocketFactory.h @@ -14,46 +14,4 @@ * limitations under the License. */ -#pragma once - -#include -#include - -namespace folly { - -class AsyncIoUringSocketFactory { - public: - static bool supports([[maybe_unused]] folly::EventBase* eb) { -#if FOLLY_HAS_LIBURING - return AsyncIoUringSocket::supports(eb); -#else - return false; -#endif - } - - template - static TWrapper create([[maybe_unused]] Args&&... args) { -#if FOLLY_HAS_LIBURING - return TWrapper(new AsyncIoUringSocket(std::forward(args)...)); -#else - throw std::runtime_error("AsyncIoUringSocket not supported"); -#endif - } - - static bool asyncDetachFd( - [[maybe_unused]] AsyncTransport& transport, - [[maybe_unused]] AsyncDetachFdCallback* callback) { -#if FOLLY_HAS_LIBURING - AsyncIoUringSocket* socket = - transport.getUnderlyingTransport(); - if (socket) { - socket->asyncDetachFd(callback); - return true; - } -#endif - - return false; - } -}; - -} // namespace folly +#include diff --git a/folly/experimental/io/BUCK b/folly/experimental/io/BUCK index 26e547d7824..b19d4d3d6d9 100644 --- a/folly/experimental/io/BUCK +++ b/folly/experimental/io/BUCK @@ -4,124 +4,52 @@ oncall("fbcode_entropy_wardens_folly") cpp_library( name = "async_base", - srcs = ["AsyncBase.cpp"], - headers = ["AsyncBase.h"], - deps = [ - "//folly:exception", - "//folly:format", - "//folly:likely", - "//folly:string", - "//folly/portability:filesystem", - "//folly/portability:unistd", + headers = [ + "AsyncBase.h", ], exported_deps = [ - "//folly:function", - "//folly:portability", - "//folly:range", - "//folly/portability:sys_uio", - ], - external_deps = [ - "boost", - "glog", + "//folly/io/async:async_base_class", ], ) cpp_library( name = "async_io", - srcs = ["AsyncIO.cpp"], - headers = ["AsyncIO.h"], - deps = [ - "fbsource//third-party/fmt:fmt", - "//folly:exception", - "//folly:likely", - "//folly:small_vector", - "//folly:string", - 
"//folly/portability:unistd", + headers = [ + "AsyncIO.h", ], exported_deps = [ - ":async_base", - ], - external_deps = [ - "boost", - "glog", - ], - exported_external_deps = [ - ("libaio", None, "aio"), + "//folly/io/async:async_io", ], ) cpp_library( - # @autodeps-skip name = "liburing", - headers = ["Liburing.h"], - os_deps = [( - "linux", - select({ - "DEFAULT": ["fbsource//third-party/liburing:uring"], - "ovr_config//os:linux-sgx": [], - }), - )], + headers = [ + "Liburing.h", + ], + exported_deps = [ + "//folly/io/async:liburing", + ], ) cpp_library( name = "async_io_uring_socket", - srcs = [ - "AsyncIoUringSocket.cpp", - ], headers = [ "AsyncIoUringSocket.h", "AsyncIoUringSocketFactory.h", ], - deps = [ - ":io_uring_event_base_local", - "//folly:conv", - "//folly/detail:socket_fast_open", - "//folly/memory:malloc", - "//folly/portability:sys_uio", - ], exported_deps = [ - ":io_uring_backend", - ":liburing", - "//folly:network_address", - "//folly:optional", - "//folly:small_vector", - "//folly/futures:core", - "//folly/io:iobuf", - "//folly/io:socket_option_map", - "//folly/io/async:async_base", - "//folly/io/async:async_socket", - "//folly/io/async:async_socket_exception", - "//folly/io/async:async_transport", - "//folly/io/async:delayed_destruction", - "//folly/net:net_ops_dispatcher", - "//folly/portability:sockets", - ], - exported_external_deps = [ - "boost", + "//folly/io/async:async_io_uring_socket", ], ) cpp_library( name = "simple_async_io", - srcs = ["SimpleAsyncIO.cpp"], - headers = ["SimpleAsyncIO.h"], - deps = [ - ":async_io", - ":io_uring", - ":liburing", - "//folly:string", - "//folly/experimental/coro:baton", - "//folly/portability:sockets", + headers = [ + "SimpleAsyncIO.h", ], exported_deps = [ - ":async_base", - "//folly:synchronized", - "//folly/executors:global_executor", - "//folly/experimental/coro:task", - "//folly/io/async:async_base", - "//folly/io/async:scoped_event_base_thread", - ], - exported_external_deps = [ + 
"//folly/io/async:simple_async_io", ], ) @@ -130,213 +58,93 @@ cpp_library( headers = [ "Epoll.h", ], + exported_deps = [ + "//folly/io/async:epoll", + ], ) cpp_library( + # @autodeps-skip name = "epoll_backend", - srcs = [ - "EpollBackend.cpp", - ], headers = [ "Epoll.h", "EpollBackend.h", ], - modular_headers = False, - deps = [ - "//folly:file_util", - "//folly:intrusive_list", - "//folly:map_util", - "//folly:string", - ], exported_deps = [ - "//folly/container:intrusive_heap", - "//folly/io/async:async_base", + "//folly/io/async:epoll_backend", ], ) cpp_library( name = "event_base_poller", - srcs = ["EventBasePoller.cpp"], - headers = ["EventBasePoller.h"], - deps = [ - "fbsource//third-party/fmt:fmt", - ":epoll", - ":liburing", - "//folly:file_util", - "//folly:string", - "//folly/lang:align", - "//folly/portability:gflags", - "//folly/synchronization:baton", - "//folly/system:thread_name", + headers = [ + "EventBasePoller.h", ], exported_deps = [ - "//folly:function", - "//folly:range", - "//folly:synchronized", - ], - external_deps = [ - "boost", - "glog", + "//folly/io/async:event_base_poller", ], ) cpp_library( name = "mux_io_thread_pool_executor", - srcs = ["MuxIOThreadPoolExecutor.cpp"], - headers = ["MuxIOThreadPoolExecutor.h"], - deps = [ - "fbsource//third-party/fmt:fmt", - ":epoll_backend", - "//folly/container:enumerate", - "//folly/lang:align", - "//folly/synchronization:latch", + headers = [ + "MuxIOThreadPoolExecutor.h", ], exported_deps = [ - ":event_base_poller", - "//folly:portability", - "//folly/concurrency:unbounded_queue", - "//folly/executors:io_thread_pool_executor", - "//folly/executors:queue_observer", - "//folly/io/async:event_base_manager", - "//folly/synchronization:baton", - "//folly/synchronization:relaxed_atomic", - "//folly/synchronization:throttled_lifo_sem", - "//folly/synchronization:wait_options", + "//folly/io/async:mux_io_thread_pool_executor", ], ) cpp_library( name = "io_uring", - srcs = ["IoUring.cpp"], - headers = 
["IoUring.h"], - modular_headers = False, - deps = [ - "fbsource//third-party/fmt:fmt", - "//folly:exception", - "//folly:likely", - "//folly:string", - "//folly/portability:unistd", + headers = [ + "IoUring.h", ], exported_deps = [ - ":async_base", - ":liburing", - "//folly:shared_mutex", - ], - external_deps = [ - "boost", - "glog", + "//folly/io/async:io_uring", ], ) cpp_library( name = "io_uring_backend", - srcs = [ - "IoUringBackend.cpp", - ], headers = [ "IoUringBackend.h", "IoUringBase.h", ], - modular_headers = False, - deps = [ - "//folly:demangle", - "//folly:file_util", - "//folly:glog", - "//folly:likely", - "//folly:spin_lock", - "//folly:string", - "//folly/container:f14_hash", - "//folly/experimental/io:io_uring_provided_buffer_ring", - "//folly/lang:bits", - "//folly/portability:gflags", - "//folly/portability:sockets", - "//folly/portability:sys_mman", - "//folly/portability:sys_syscall", - "//folly/synchronization:call_once", - ], exported_deps = [ - ":liburing", - "//folly:c_portability", - "//folly:conv", - "//folly:cpp_attributes", - "//folly:exception_string", - "//folly:function", - "//folly:optional", - "//folly:range", - "//folly:small_vector", - "//folly/io:iobuf", - "//folly/io/async:async_base", - "//folly/io/async:delayed_destruction", - "//folly/portability:asm", - ], - exported_external_deps = [ - "boost", - "glog", + "//folly/io/async:io_uring_backend", ], ) cpp_library( + # @autodeps-skip name = "io_uring_provided_buffer_ring", - srcs = [ - "IoUringProvidedBufferRing.cpp", - ], headers = [ "IoUringBase.h", "IoUringProvidedBufferRing.h", ], - modular_headers = False, - deps = [ - "//folly:conv", - "//folly:exception_string", - "//folly:string", - ], exported_deps = [ - ":liburing", - "//folly/io:iobuf", - "//folly/io/async:delayed_destruction", - "//folly/portability:sys_mman", - ], - exported_external_deps = [ - "boost", + "//folly/io/async:io_uring_provided_buffer_ring", ], ) cpp_library( name = "io_uring_event", - srcs = [ - 
"IoUringEvent.cpp", - ], headers = [ "IoUringEvent.h", ], - modular_headers = False, exported_deps = [ - ":io_uring_backend", - ":liburing", - "//folly:file", - "//folly/io/async:async_base", + "//folly/io/async:io_uring_event", ], ) cpp_library( name = "io_uring_event_base_local", - srcs = [ - "IoUringEventBaseLocal.cpp", - ], headers = [ "IoUringEventBaseLocal.h", ], - modular_headers = False, - deps = [ - ":io_uring_event", - "//folly:singleton", - ], exported_deps = [ - ":io_uring_backend", - ":liburing", - "//folly/io/async:async_base", - ], - exported_external_deps = [ + "//folly/io/async:io_uring_event_base_local", ], ) diff --git a/folly/experimental/io/Epoll.h b/folly/experimental/io/Epoll.h index 2cb342b6a0d..dd5fc9728e2 100644 --- a/folly/experimental/io/Epoll.h +++ b/folly/experimental/io/Epoll.h @@ -14,10 +14,4 @@ * limitations under the License. */ -#pragma once - -#if defined(__linux__) && __has_include() -#define FOLLY_HAS_EPOLL 1 -#else -#define FOLLY_HAS_EPOLL 0 -#endif +#include diff --git a/folly/experimental/io/EpollBackend.h b/folly/experimental/io/EpollBackend.h index 37fa493ecd3..dd4a1a221bd 100644 --- a/folly/experimental/io/EpollBackend.h +++ b/folly/experimental/io/EpollBackend.h @@ -14,101 +14,4 @@ * limitations under the License. 
*/ -#pragma once - -#include - -#if FOLLY_HAS_EPOLL - -#include -#include -#include -#include -#include - -#include -#include - -namespace folly { - -class EpollBackend : public EventBaseBackendBase { - public: - struct Options { - size_t numLoopEvents{128}; - - Options& setNumLoopEvents(size_t val) { - numLoopEvents = val; - return *this; - } - }; - - explicit EpollBackend(Options options); - ~EpollBackend() override; - - int getEpollFd() const { return epollFd_; } - - int getPollableFd() const override { return epollFd_; } - - event_base* getEventBase() override { return nullptr; } - - // Returns a non-standard value 2 when called with EVLOOP_NONBLOCK and the - // loop would block if called in a blocking fashion. - int eb_event_base_loop(int flags) override; - int eb_event_base_loopbreak() override; - - int eb_event_add(Event& event, const struct timeval* timeout) override; - int eb_event_del(Event& event) override; - - bool eb_event_active(Event&, int) override { return false; } - - bool setEdgeTriggered(Event& event) override; - - private: - struct TimerInfo; - - class SocketPair { - public: - SocketPair(); - - SocketPair(const SocketPair&) = delete; - SocketPair& operator=(const SocketPair&) = delete; - - ~SocketPair(); - - int readFd() const { return fds_[1]; } - - int writeFd() const { return fds_[0]; } - - private: - std::array fds_{{-1, -1}}; - }; - - void updateTimerFd(); - void addTimerEvent(Event& event, const struct timeval* timeout); - int removeTimerEvent(Event& event); - void processTimers(); - void setProcessTimers(); - - void addSignalEvent(Event& event); - int removeSignalEvent(Event& event); - void processSignals(); - - const Options options_; - - int epollFd_{-1}; - - size_t numInsertedEvents_{0}; - size_t numInternalEvents_{0}; - - bool loopBreak_{false}; - std::vector events_; // Cache allocation. 
- - int timerFd_{-1}; - std::optional timerFdExpiration_; - IntrusiveHeap timers_; - - SocketPair signalFds_; - std::map> signals_; -}; -} // namespace folly -#endif +#include diff --git a/folly/experimental/io/EventBasePoller.h b/folly/experimental/io/EventBasePoller.h index 2359fc399b2..742a9fbf7c7 100644 --- a/folly/experimental/io/EventBasePoller.h +++ b/folly/experimental/io/EventBasePoller.h @@ -14,106 +14,4 @@ * limitations under the License. */ -#pragma once - -#include -#include - -#include -#include -#include - -namespace folly::detail { - -/** - * EventBasePoller centralizes the blocking wait for events across multiple - * EventBases in a process. The singleton calls the provided ReadyCallback on - * ready EventBases, so they can be driven without blocking. This enables - * control over which threads drive the EventBases, as opposed to the standard - * blocking loop that requires one thread per EventBase. - * - * EventBases' pollable fds are registered in groups, so that the callback can - * batch processing of ready EventBases that belong to the same group. - * - * When the EventBase is ready it can be driven until it would block again, and - * then handoff() must be called to resume polling the fd. Neither the driving - * of the EventBase or the call to handoff() should happen inline in the - * callback, but delegated to another thread without blocking; the callback must - * return control quickly, as it executes in the main polling loop and can slow - * down the handling of all other registered EventBases. - * - * Note that none of the implementation is specific to EventBases, in fact this - * is a lightweight implementation of an event loop specialized on polling read - * events, and which supports grouping of the fds for batch-handling. The class - * could be easily generalized if other applications arise. 
- */ -class EventBasePoller { - public: - struct Stats { - using Duration = std::chrono::steady_clock::duration; - - // Track number of loop wake-ups and number of events returned. - int minNumEvents{std::numeric_limits::max()}; - int maxNumEvents{std::numeric_limits::min()}; - size_t totalNumEvents{0}; - size_t totalWakeups{0}; - - Duration totalWait{0}; - Duration minWait{Duration::max()}; - Duration maxWait{Duration::min()}; - - Duration totalBusy{0}; - Duration minBusy{Duration::max()}; - Duration maxBusy{Duration::min()}; - - void update(int numEvents, Duration wait, Duration busy); - }; - - class Handle { - public: - virtual ~Handle(); - - template - T* getUserData() const { - return reinterpret_cast(userData_); - } - - // If done is set to true, the handle is not re-armed and can be reclaimed - // with reclaim(). - virtual void handoff(bool done) = 0; - - protected: - friend class EventBasePoller; - - explicit Handle(void* userData) : userData_(userData) {} - - void* userData_; - }; - - // FdGroup method invocations must be serialized. - class FdGroup { - public: - virtual ~FdGroup(); - - // All added handles must be reclaimed before the group is destroyed. - virtual std::unique_ptr add(int fd, void* userData) = 0; - // Blocks until handoff(true) is called on the handle. - virtual void reclaim(std::unique_ptr handle) = 0; - }; - - using ReadyCallback = - Function readyHandles) const noexcept>; - - static EventBasePoller& get(); - - virtual ~EventBasePoller(); - - virtual std::unique_ptr makeFdGroup(ReadyCallback readyCallback) = 0; - - Stats getStats() { return stats_.copy(); } - - protected: - folly::Synchronized stats_; -}; - -} // namespace folly::detail +#include diff --git a/folly/experimental/io/IoUring.h b/folly/experimental/io/IoUring.h index 9e1d1592587..1aee31a72a4 100644 --- a/folly/experimental/io/IoUring.h +++ b/folly/experimental/io/IoUring.h @@ -14,169 +14,4 @@ * limitations under the License. 
*/ -#pragma once - -#include -#include -#include - -#if FOLLY_HAS_LIBURING - -#include // @manual - -namespace folly { - -/** - * An IoUringOp represents a pending operation. You may set a notification - * callback or you may use this class's methods directly. - * - * The op must remain allocated until it is completed or canceled. - */ -class IoUringOp : public AsyncBaseOp { - friend class IoUring; - friend std::ostream& operator<<(std::ostream& stream, const IoUringOp& o); - - public: - struct Options { - Options() : sqe128(false), cqe32(false) {} - bool sqe128; - bool cqe32; - - bool operator==(const Options& options) const { - return sqe128 == options.sqe128 && cqe32 == options.cqe32; - } - - bool operator!=(const Options& options) const { - return !operator==(options); - } - }; - - IoUringOp( - NotificationCallback cb = NotificationCallback(), - Options options = Options()); - IoUringOp(const IoUringOp&) = delete; - IoUringOp& operator=(const IoUringOp&) = delete; - ~IoUringOp() override; - - /** - * Initiate a read request. - */ - void pread(int fd, void* buf, size_t size, off_t start) override; - void preadv(int fd, const iovec* iov, int iovcnt, off_t start) override; - void pread( - int fd, void* buf, size_t size, off_t start, int buf_index) override; - - /** - * Initiate a write request. - */ - void pwrite(int fd, const void* buf, size_t size, off_t start) override; - void pwritev(int fd, const iovec* iov, int iovcnt, off_t start) override; - void pwrite(int fd, const void* buf, size_t size, off_t start, int buf_index) - override; - - void reset(NotificationCallback cb = NotificationCallback()) override; - - AsyncIOOp* getAsyncIOOp() override { return nullptr; } - - IoUringOp* getIoUringOp() override { return this; } - - void toStream(std::ostream& os) const override; - - void initBase() { init(); } - - struct io_uring_sqe& getSqe() { return sqe_.sqe; } - - size_t getSqeSize() const { - return options_.sqe128 ? 
128 : sizeof(struct io_uring_sqe); - } - - const struct io_uring_cqe& getCqe() const { - return *reinterpret_cast(&cqe_); - } - - size_t getCqeSize() const { - return options_.cqe32 ? 32 : sizeof(struct io_uring_cqe); - } - - void setCqe(const struct io_uring_cqe* cqe) { - ::memcpy(&cqe_, cqe, getCqeSize()); - } - - const Options& getOptions() const { return options_; } - - private: - Options options_; - - // we use unions with the largest size to avoid - // indidual allocations for the sqe/cqe - union { - struct io_uring_sqe sqe; - uint8_t data[128]; - } sqe_; - - // we have to use a union here because of -Wgnu-variable-sized-type-not-at-end - //__u64 big_cqe[]; - union { - __u64 user_data; // first member from from io_uring_cqe - uint8_t data[32]; - } cqe_; - - struct iovec iov_[1]; -}; - -std::ostream& operator<<(std::ostream& stream, const IoUringOp& op); - -/** - * C++ interface around Linux io_uring - */ -class IoUring : public AsyncBase { - public: - using Op = IoUringOp; - - /** - * Note: the maximum number of allowed concurrent requests is controlled - * by the kernel IORING_MAX_ENTRIES and the memlock limit, - * The default IORING_MAX_ENTRIES value is usually 32K. 
- */ - explicit IoUring( - size_t capacity, - PollMode pollMode = NOT_POLLABLE, - size_t maxSubmit = 1, - IoUringOp::Options options = IoUringOp::Options()); - IoUring(const IoUring&) = delete; - IoUring& operator=(const IoUring&) = delete; - ~IoUring() override; - - static bool isAvailable(); - - const IoUringOp::Options& getOptions() const { return options_; } - - int register_buffers(const struct iovec* iovecs, unsigned int nr_iovecs); - - int unregister_buffers(); - - void initializeContext() override; - - protected: - int drainPollFd() override; - int submitOne(AsyncBase::Op* op) override; - int submitRange(Range ops) override; - - private: - Range doWait( - WaitType type, - size_t minRequests, - size_t maxRequests, - std::vector& result) override; - - size_t maxSubmit_; - IoUringOp::Options options_; - struct io_uring_params params_; - struct io_uring ioRing_; - mutable SharedMutex submitMutex_; -}; - -using IoUringQueue = AsyncBaseQueue; -} // namespace folly - -#endif +#include diff --git a/folly/experimental/io/IoUringBackend.h b/folly/experimental/io/IoUringBackend.h index c59293ebf81..20553299c9d 100644 --- a/folly/experimental/io/IoUringBackend.h +++ b/folly/experimental/io/IoUringBackend.h @@ -14,1092 +14,4 @@ * limitations under the License. 
*/ -#pragma once - -#include - -#include -#include -#include -#include - -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if __has_include() -#include -#endif - -#if FOLLY_HAS_LIBURING - -#include // @manual - -namespace folly { - -class IoUringBackend : public EventBaseBackendBase { - public: - class FOLLY_EXPORT NotAvailable : public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - struct Options { - enum Flags { - POLL_SQ = 0x1, - POLL_CQ = 0x2, - POLL_SQ_IMMEDIATE_IO = 0x4, // do not enqueue I/O operations - }; - - Options() = default; - - Options& setCapacity(size_t v) { - capacity = v; - return *this; - } - - Options& setMinCapacity(size_t v) { - minCapacity = v; - - return *this; - } - - Options& setMaxSubmit(size_t v) { - maxSubmit = v; - - return *this; - } - - Options& setSqeSize(size_t v) { - sqeSize = v; - - return *this; - } - - Options& setMaxGet(size_t v) { - maxGet = v; - - return *this; - } - - Options& setUseRegisteredFds(size_t v) { - registeredFds = v; - return *this; - } - - Options& setFlags(uint32_t v) { - flags = v; - - return *this; - } - - Options& setSQIdle(std::chrono::milliseconds v) { - sqIdle = v; - - return *this; - } - - Options& setCQIdle(std::chrono::milliseconds v) { - cqIdle = v; - - return *this; - } - - // Set the CPU as preferred for submission queue poll thread. - // - // This only has effect if POLL_SQ flag is specified. - // - // Can call multiple times to specify multiple CPUs. - Options& setSQCpu(uint32_t v) { - sqCpus.insert(v); - - return *this; - } - - // Set the preferred CPUs for submission queue poll thread(s). - // - // This only has effect if POLL_SQ flag is specified. 
- Options& setSQCpus(std::set const& cpus) { - sqCpus.insert(cpus.begin(), cpus.end()); - - return *this; - } - - Options& setSQGroupName(const std::string& v) { - sqGroupName = v; - - return *this; - } - - Options& setSQGroupNumThreads(size_t v) { - sqGroupNumThreads = v; - - return *this; - } - - Options& setInitialProvidedBuffers(size_t eachSize, size_t count) { - initialProvidedBuffersCount = count; - initialProvidedBuffersEachSize = eachSize; - return *this; - } - - Options& setRegisterRingFd(bool v) { - registerRingFd = v; - - return *this; - } - - Options& setTaskRunCoop(bool v) { - taskRunCoop = v; - - return *this; - } - - Options& setDeferTaskRun(bool v) { - deferTaskRun = v; - - return *this; - } - - Options& setTimeout(std::chrono::microseconds v) { - timeout = v; - - return *this; - } - - Options& setBatchSize(int v) { - batchSize = v; - - return *this; - } - - ssize_t sqeSize{-1}; - - size_t capacity{256}; - size_t minCapacity{0}; - size_t maxSubmit{128}; - size_t maxGet{256}; - size_t registeredFds{0}; - size_t sqGroupNumThreads{1}; - size_t initialProvidedBuffersCount{0}; - size_t initialProvidedBuffersEachSize{0}; - - uint32_t flags{0}; - - // Minimum number of requests (defined as sockets with data to read) to wait - // for per io_uring_enter - int batchSize{0}; - - bool registerRingFd{false}; - bool taskRunCoop{false}; - bool deferTaskRun{false}; - - // Maximum amount of time to wait (in microseconds) per io_uring_enter - // Both timeout _and_ batchSize must be set for io_uring_enter wait_nr to be - // set! 
- std::chrono::microseconds timeout{0}; - std::chrono::milliseconds sqIdle{0}; - std::chrono::milliseconds cqIdle{0}; - - std::set sqCpus; - - std::string sqGroupName; - }; - - explicit IoUringBackend(Options options); - ~IoUringBackend() override; - Options const& options() const { return options_; } - - bool isWaitingToSubmit() const { - return waitingToSubmit_ || !submitList_.empty(); - } - struct io_uring* ioRingPtr() { return &ioRing_; } - struct io_uring_params const& params() const { return params_; } - bool useReqBatching() const { - return options_.timeout.count() > 0 && options_.batchSize > 0; - } - - // from EventBaseBackendBase - int getPollableFd() const override { return ioRing_.ring_fd; } - - event_base* getEventBase() override { return nullptr; } - - int eb_event_base_loop(int flags) override; - int eb_event_base_loopbreak() override; - - int eb_event_add(Event& event, const struct timeval* timeout) override; - int eb_event_del(Event& event) override; - - bool eb_event_active(Event&, int) override { return false; } - - size_t loopPoll(); - void submitOutstanding(); - unsigned int processCompleted(); - - // returns true if the current Linux kernel version - // supports the io_uring backend - static bool isAvailable(); - bool kernelHasNonBlockWriteFixes() const; - static bool kernelSupportsRecvmsgMultishot(); - static bool kernelSupportsDeferTaskrun(); - static bool kernelSupportsSendZC(); - - IoUringFdRegistrationRecord* registerFd(int fd) noexcept { - return fdRegistry_.alloc(fd); - } - - bool unregisterFd(IoUringFdRegistrationRecord* rec) { - return fdRegistry_.free(rec); - } - - // CQ poll mode loop callback - using CQPollLoopCallback = folly::Function; - - void setCQPollLoopCallback(CQPollLoopCallback&& cb) { - cqPollLoopCallback_ = std::move(cb); - } - - // read/write/fsync/fdatasync file operation callback - // int param is the io_uring_cqe res field - // i.e. 
the result of the file operation - using FileOpCallback = folly::Function; - - void queueRead( - int fd, - void* buf, - unsigned int nbytes, - off_t offset, - FileOpCallback&& cb); - - void queueWrite( - int fd, - const void* buf, - unsigned int nbytes, - off_t offset, - FileOpCallback&& cb); - - void queueReadv( - int fd, - Range iovecs, - off_t offset, - FileOpCallback&& cb); - - void queueWritev( - int fd, - Range iovecs, - off_t offset, - FileOpCallback&& cb); - - // there is no ordering between the prev submitted write - // requests and the sync ops - // ordering can be achieved by calling queue*sync from one of - // the prev write callbacks, once all the write operations - // we have to wait for are done - void queueFsync(int fd, FileOpCallback&& cb); - void queueFdatasync(int fd, FileOpCallback&& cb); - - void queueOpenat( - int dfd, const char* path, int flags, mode_t mode, FileOpCallback&& cb); - - void queueOpenat2( - int dfd, const char* path, struct open_how* how, FileOpCallback&& cb); - - void queueClose(int fd, FileOpCallback&& cb); - - void queueStatx( - int dirfd, - const char* pathname, - int flags, - unsigned int mask, - struct statx* statxbuf, - FileOpCallback&& cb); - - void queueFallocate( - int fd, int mode, off_t offset, off_t len, FileOpCallback&& cb); - - // sendmgs/recvmsg - void queueSendmsg( - int fd, - const struct msghdr* msg, - unsigned int flags, - FileOpCallback&& cb); - - void queueRecvmsg( - int fd, struct msghdr* msg, unsigned int flags, FileOpCallback&& cb); - - void submit(IoSqeBase& ioSqe) { - // todo verify that the sqe is valid! 
- submitImmediateIoSqe(ioSqe); - } - - void submitNextLoop(IoSqeBase& ioSqe) noexcept; - void submitSoon(IoSqeBase& ioSqe) noexcept; - void submitNow(IoSqeBase& ioSqe); - void cancel(IoSqeBase* sqe); - - // built in buffer provider - IoUringBufferProviderBase* bufferProvider() { return bufferProvider_.get(); } - uint16_t nextBufferProviderGid() { return bufferProviderGidNext_++; } - - protected: - enum class WaitForEventsMode { WAIT, DONT_WAIT }; - - class SocketPair { - public: - SocketPair(); - - SocketPair(const SocketPair&) = delete; - SocketPair& operator=(const SocketPair&) = delete; - - ~SocketPair(); - - int readFd() const { return fds_[1]; } - - int writeFd() const { return fds_[0]; } - - private: - std::array fds_{-1, -1}; - }; - - struct UserData { - uintptr_t value; - explicit UserData(void* p) noexcept - : value{reinterpret_cast(p)} {} - /* implicit */ operator uint64_t() const noexcept { return value; } - /* implicit */ operator void*() const noexcept { - return reinterpret_cast(value); - } - }; - - static uint32_t getPollFlags(short events) { - uint32_t ret = 0; - if (events & EV_READ) { - ret |= POLLIN; - } - - if (events & EV_WRITE) { - ret |= POLLOUT; - } - - return ret; - } - - static short getPollEvents(uint32_t flags, short events) { - short ret = 0; - if (flags & POLLIN) { - ret |= EV_READ; - } - - if (flags & POLLOUT) { - ret |= EV_WRITE; - } - - if (flags & (POLLERR | POLLHUP)) { - ret |= (EV_READ | EV_WRITE); - } - - ret &= events; - - return ret; - } - - // timer processing - bool addTimerFd(); - void scheduleTimeout(); - void scheduleTimeout(const std::chrono::microseconds& us); - void addTimerEvent(Event& event, const struct timeval* timeout); - void removeTimerEvent(Event& event); - size_t processTimers(); - void setProcessTimers(); - - size_t processActiveEvents(); - - struct IoSqe; - - static void processPollIoSqe( - IoUringBackend* backend, IoSqe* ioSqe, int res, uint32_t flags); - static void processTimerIoSqe( - IoUringBackend* 
backend, - IoSqe* /*sqe*/, - int /*res*/, - uint32_t /* flags */); - static void processSignalReadIoSqe( - IoUringBackend* backend, - IoSqe* /*sqe*/, - int /*res*/, - uint32_t /* flags */); - - // signal handling - void addSignalEvent(Event& event); - void removeSignalEvent(Event& event); - bool addSignalFds(); - size_t processSignals(); - void setProcessSignals(); - - void processPollIo(IoSqe* ioSqe, int res, uint32_t flags) noexcept; - - IoSqe* FOLLY_NULLABLE allocIoSqe(const EventCallback& cb); - void releaseIoSqe(IoSqe* aioIoSqe) noexcept; - - // submit immediate if POLL_SQ | POLL_SQ_IMMEDIATE_IO flags are set - void submitImmediateIoSqe(IoSqeBase& ioSqe); - - void internalSubmit(IoSqeBase& ioSqe) noexcept; - - enum class InternalProcessCqeMode { - NORMAL, // process existing and any available - AVAILABLE_ONLY, // process existing but don't get more - CANCEL_ALL, // cancel every sqe - }; - unsigned int internalProcessCqe( - unsigned int maxGet, InternalProcessCqeMode mode) noexcept; - - int eb_event_modify_inserted(Event& event, IoSqe* ioSqe); - - struct FdRegistry { - FdRegistry() = delete; - FdRegistry(struct io_uring& ioRing, size_t n); - - IoUringFdRegistrationRecord* alloc(int fd) noexcept; - bool free(IoUringFdRegistrationRecord* record); - - int init(); - size_t update(); - - bool err_{false}; - struct io_uring& ioRing_; - std::vector files_; - size_t inUse_; - std::vector records_; - boost::intrusive:: - slist> - free_; - }; - - struct IoSqe : public IoSqeBase { - using BackendCb = void(IoUringBackend*, IoSqe*, int, uint32_t); - explicit IoSqe( - IoUringBackend* backend = nullptr, - bool poolAlloc = false, - bool persist = false) - : backend_(backend), poolAlloc_(poolAlloc), persist_(persist) {} - - void callback(const io_uring_cqe* cqe) noexcept override { - backendCb_(backend_, this, cqe->res, cqe->flags); - } - void callbackCancelled(const io_uring_cqe*) noexcept override { release(); } - virtual void release() noexcept; - - IoUringBackend* backend_; 
- BackendCb* backendCb_{nullptr}; - const bool poolAlloc_; - const bool persist_; - Event* event_{nullptr}; - IoUringFdRegistrationRecord* fdRecord_{nullptr}; - size_t useCount_{0}; - int res_; - uint32_t cqeFlags_; - - FOLLY_ALWAYS_INLINE void resetEvent() { - // remove it from the list - unlink(); - if (event_) { - event_->setUserData(nullptr); - event_ = nullptr; - } - } - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - auto* ev = event_->getEvent(); - if (ev) { - const auto& cb = event_->getCallback(); - switch (cb.type_) { - case EventCallback::Type::TYPE_NONE: - break; - case EventCallback::Type::TYPE_READ: - if (auto* iov = cb.readCb_->allocateData()) { - prepRead( - sqe, - ev->ev_fd, - &iov->data_, - 0, - (ev->ev_events & EV_PERSIST) != 0); - cbData_.set(iov); - return; - } - break; - case EventCallback::Type::TYPE_RECVMSG: - if (auto* msg = cb.recvmsgCb_->allocateData()) { - prepRecvmsg( - sqe, - ev->ev_fd, - &msg->data_, - (ev->ev_events & EV_PERSIST) != 0); - cbData_.set(msg); - return; - } - break; - case EventCallback::Type::TYPE_RECVMSG_MULTISHOT: - if (auto* hdr = - cb.recvmsgMultishotCb_->allocateRecvmsgMultishotData()) { - prepRecvmsgMultishot(sqe, ev->ev_fd, &hdr->data_); - cbData_.set(hdr); - return; - } - break; - } - prepPollAdd(sqe, ev->ev_fd, getPollFlags(ev->ev_events)); - } - } - - virtual void processActive() {} - - struct EventCallbackData { - EventCallback::Type type_{EventCallback::Type::TYPE_NONE}; - union { - EventReadCallback::IoVec* ioVec_; - EventRecvmsgCallback::MsgHdr* msgHdr_; - EventRecvmsgMultishotCallback::Hdr* hdr_; - }; - - void set(EventReadCallback::IoVec* ioVec) { - type_ = EventCallback::Type::TYPE_READ; - ioVec_ = ioVec; - } - - void set(EventRecvmsgCallback::MsgHdr* msgHdr) { - type_ = EventCallback::Type::TYPE_RECVMSG; - msgHdr_ = msgHdr; - } - - void set(EventRecvmsgMultishotCallback::Hdr* hdr) { - type_ = EventCallback::Type::TYPE_RECVMSG_MULTISHOT; - hdr_ = hdr; - } - - void reset() { type_ = 
EventCallback::Type::TYPE_NONE; } - - bool processCb(IoUringBackend* backend, int res, uint32_t flags) { - bool ret = false; - bool released = false; - switch (type_) { - case EventCallback::Type::TYPE_READ: { - released = ret = true; - auto cbFunc = ioVec_->cbFunc_; - cbFunc(ioVec_, res); - break; - } - case EventCallback::Type::TYPE_RECVMSG: { - released = ret = true; - auto cbFunc = msgHdr_->cbFunc_; - cbFunc(msgHdr_, res); - break; - } - case EventCallback::Type::TYPE_RECVMSG_MULTISHOT: { - ret = true; - std::unique_ptr buf; - if (flags & IORING_CQE_F_BUFFER) { - if (IoUringBufferProviderBase* bp = backend->bufferProvider()) { - buf = bp->getIoBuf(flags >> 16, res); - } - } - hdr_->cbFunc_(hdr_, res, std::move(buf)); - if (!(flags & IORING_CQE_F_MORE)) { - hdr_->freeFunc_(hdr_); - released = true; - } - break; - } - case EventCallback::Type::TYPE_NONE: - break; - } - - if (released) { - type_ = EventCallback::Type::TYPE_NONE; - } - - return ret; - } - - void releaseData() { - switch (type_) { - case EventCallback::Type::TYPE_READ: { - auto freeFunc = ioVec_->freeFunc_; - freeFunc(ioVec_); - break; - } - case EventCallback::Type::TYPE_RECVMSG: { - auto freeFunc = msgHdr_->freeFunc_; - freeFunc(msgHdr_); - break; - } - case EventCallback::Type::TYPE_RECVMSG_MULTISHOT: - hdr_->freeFunc_(hdr_); - break; - case EventCallback::Type::TYPE_NONE: - break; - } - type_ = EventCallback::Type::TYPE_NONE; - } - }; - - EventCallbackData cbData_; - - void prepPollAdd( - struct io_uring_sqe* sqe, int fd, uint32_t events) noexcept { - CHECK(sqe); - ::io_uring_prep_poll_add(sqe, fd, events); - ::io_uring_sqe_set_data(sqe, this); - } - - void prepRead( - struct io_uring_sqe* sqe, - int fd, - const struct iovec* iov, - off_t offset, - bool registerFd) noexcept { - prepUtilFunc( - ::io_uring_prep_read, - sqe, - registerFd, - fd, - iov->iov_base, - (unsigned int)iov->iov_len, - offset); - } - - void prepWrite( - struct io_uring_sqe* sqe, - int fd, - const struct iovec* iov, - off_t 
offset, - bool registerFd) noexcept { - prepUtilFunc( - ::io_uring_prep_write, - sqe, - registerFd, - fd, - iov->iov_base, - (unsigned int)iov->iov_len, - offset); - } - - void prepRecvmsg( - struct io_uring_sqe* sqe, - int fd, - struct msghdr* msg, - bool registerFd) noexcept { - prepUtilFunc( - ::io_uring_prep_recvmsg, sqe, registerFd, fd, msg, MSG_TRUNC); - } - - template - void prepUtilFunc( - Fn fn, - struct io_uring_sqe* sqe, - bool registerFd, - int fd, - Args... args) { - CHECK(sqe); - if (registerFd && !fdRecord_) { - fdRecord_ = backend_->registerFd(fd); - } - - if (fdRecord_) { - fn(sqe, fdRecord_->idx_, std::forward(args)...); - sqe->flags |= IOSQE_FIXED_FILE; - } else { - fn(sqe, fd, std::forward(args)...); - } - - ::io_uring_sqe_set_data(sqe, this); - } - - void prepRecvmsgMultishot( - struct io_uring_sqe* sqe, int fd, struct msghdr* msg) noexcept { - CHECK(sqe); - ::io_uring_prep_recvmsg(sqe, fd, msg, MSG_TRUNC); - // this magic value is set in io_uring_prep_recvmsg_multishot, - // however this version of the library isn't available widely yet - // so just hardcode it here - constexpr uint16_t kMultishotFlag = 1U << 1; - sqe->ioprio |= kMultishotFlag; - if (IoUringBufferProviderBase* bp = backend_->bufferProvider()) { - sqe->buf_group = bp->gid(); - sqe->flags |= IOSQE_BUFFER_SELECT; - } - ::io_uring_sqe_set_data(sqe, this); - } - - FOLLY_ALWAYS_INLINE void prepCancel( - struct io_uring_sqe* sqe, IoSqe* cancel_sqe) { - CHECK(sqe); - ::io_uring_prep_cancel(sqe, UserData{cancel_sqe}, 0); - ::io_uring_sqe_set_data(sqe, this); - } - }; - - using IoSqeBaseList = boost::intrusive:: - list>; - using IoSqeList = boost::intrusive:: - list>; - - struct FileOpIoSqe : public IoSqe { - FileOpIoSqe(IoUringBackend* backend, int fd, FileOpCallback&& cb) - : IoSqe(backend, false), fd_(fd), cb_(std::move(cb)) {} - - void processActive() override { cb_(res_); } - - int fd_{-1}; - - FileOpCallback cb_; - }; - - struct ReadWriteIoSqe : public FileOpIoSqe { - 
ReadWriteIoSqe( - IoUringBackend* backend, - int fd, - const struct iovec* iov, - off_t offset, - FileOpCallback&& cb) - : FileOpIoSqe(backend, fd, std::move(cb)), - iov_(iov, iov + 1), - offset_(offset) {} - - ReadWriteIoSqe( - IoUringBackend* backend, - int fd, - Range iov, - off_t offset, - FileOpCallback&& cb) - : FileOpIoSqe(backend, fd, std::move(cb)), iov_(iov), offset_(offset) {} - - void processActive() override { cb_(res_); } - - static constexpr size_t kNumInlineIoVec = 4; - folly::small_vector iov_; - off_t offset_; - }; - - struct ReadIoSqe : public ReadWriteIoSqe { - using ReadWriteIoSqe::ReadWriteIoSqe; - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - prepRead(sqe, fd_, iov_.data(), offset_, false); - } - }; - - struct WriteIoSqe : public ReadWriteIoSqe { - using ReadWriteIoSqe::ReadWriteIoSqe; - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - prepWrite(sqe, fd_, iov_.data(), offset_, false); - } - }; - - struct ReadvIoSqe : public ReadWriteIoSqe { - using ReadWriteIoSqe::ReadWriteIoSqe; - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - ::io_uring_prep_readv( - sqe, fd_, iov_.data(), (unsigned int)iov_.size(), offset_); - ::io_uring_sqe_set_data(sqe, this); - } - }; - - struct WritevIoSqe : public ReadWriteIoSqe { - using ReadWriteIoSqe::ReadWriteIoSqe; - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - ::io_uring_prep_writev( - sqe, fd_, iov_.data(), (unsigned int)iov_.size(), offset_); - ::io_uring_sqe_set_data(sqe, this); - } - }; - - enum class FSyncFlags { - FLAGS_FSYNC = 0, - FLAGS_FDATASYNC = 1, - }; - - struct FSyncIoSqe : public FileOpIoSqe { - FSyncIoSqe( - IoUringBackend* backend, int fd, FSyncFlags flags, FileOpCallback&& cb) - : FileOpIoSqe(backend, fd, std::move(cb)), flags_(flags) {} - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - unsigned int fsyncFlags = 0; - switch (flags_) { - case FSyncFlags::FLAGS_FSYNC: - fsyncFlags = 0; - 
break; - case FSyncFlags::FLAGS_FDATASYNC: - fsyncFlags = IORING_FSYNC_DATASYNC; - break; - } - - ::io_uring_prep_fsync(sqe, fd_, fsyncFlags); - ::io_uring_sqe_set_data(sqe, this); - } - - FSyncFlags flags_; - }; - - struct FOpenAtIoSqe : public FileOpIoSqe { - FOpenAtIoSqe( - IoUringBackend* backend, - int dfd, - const char* path, - int flags, - mode_t mode, - FileOpCallback&& cb) - : FileOpIoSqe(backend, dfd, std::move(cb)), - path_(path), - flags_(flags), - mode_(mode) {} - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - ::io_uring_prep_openat(sqe, fd_, path_.c_str(), flags_, mode_); - ::io_uring_sqe_set_data(sqe, this); - } - - std::string path_; - int flags_; - mode_t mode_; - }; - - struct FOpenAt2IoSqe : public FileOpIoSqe { - FOpenAt2IoSqe( - IoUringBackend* backend, - int dfd, - const char* path, - struct open_how* how, - FileOpCallback&& cb) - : FileOpIoSqe(backend, dfd, std::move(cb)), path_(path), how_(*how) {} - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - ::io_uring_prep_openat2(sqe, fd_, path_.c_str(), &how_); - ::io_uring_sqe_set_data(sqe, this); - } - - std::string path_; - struct open_how how_; - }; - - struct FCloseIoSqe : public FileOpIoSqe { - using FileOpIoSqe::FileOpIoSqe; - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - ::io_uring_prep_close(sqe, fd_); - ::io_uring_sqe_set_data(sqe, this); - } - }; - - struct FStatxIoSqe : public FileOpIoSqe { - FStatxIoSqe( - IoUringBackend* backend, - int dfd, - const char* pathname, - int flags, - unsigned int mask, - struct statx* statxbuf, - FileOpCallback&& cb) - : FileOpIoSqe(backend, dfd, std::move(cb)), - path_(pathname), - flags_(flags), - mask_(mask), - statxbuf_(statxbuf) {} - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - ::io_uring_prep_statx(sqe, fd_, path_, flags_, mask_, statxbuf_); - ::io_uring_sqe_set_data(sqe, this); - } - - const char* path_; - int flags_; - unsigned int mask_; - struct statx* 
statxbuf_; - }; - - struct FAllocateIoSqe : public FileOpIoSqe { - FAllocateIoSqe( - IoUringBackend* backend, - int fd, - int mode, - off_t offset, - off_t len, - FileOpCallback&& cb) - : FileOpIoSqe(backend, fd, std::move(cb)), - mode_(mode), - offset_(offset), - len_(len) {} - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - ::io_uring_prep_fallocate(sqe, fd_, mode_, offset_, len_); - ::io_uring_sqe_set_data(sqe, this); - } - - int mode_; - off_t offset_; - off_t len_; - }; - - struct SendmsgIoSqe : public FileOpIoSqe { - SendmsgIoSqe( - IoUringBackend* backend, - int fd, - const struct msghdr* msg, - unsigned int flags, - FileOpCallback&& cb) - : FileOpIoSqe(backend, fd, std::move(cb)), msg_(msg), flags_(flags) {} - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - ::io_uring_prep_sendmsg(sqe, fd_, msg_, flags_); - ::io_uring_sqe_set_data(sqe, this); - } - - const struct msghdr* msg_; - unsigned int flags_; - }; - - struct RecvmsgIoSqe : public FileOpIoSqe { - RecvmsgIoSqe( - IoUringBackend* backend, - int fd, - struct msghdr* msg, - unsigned int flags, - FileOpCallback&& cb) - : FileOpIoSqe(backend, fd, std::move(cb)), msg_(msg), flags_(flags) {} - - void processSubmit(struct io_uring_sqe* sqe) noexcept override { - ::io_uring_prep_recvmsg(sqe, fd_, msg_, flags_); - ::io_uring_sqe_set_data(sqe, this); - } - - struct msghdr* msg_; - unsigned int flags_; - }; - - size_t getActiveEvents(WaitForEventsMode waitForEvents); - size_t prepList(IoSqeBaseList& ioSqes); - int submitOne(); - int cancelOne(IoSqe* ioSqe); - - int submitBusyCheck(int num, WaitForEventsMode waitForEvents) noexcept; - int submitEager(); - - void queueFsync(int fd, FSyncFlags flags, FileOpCallback&& cb); - - void processFileOp(IoSqe* ioSqe, int res) noexcept; - - static void processFileOpCB( - IoUringBackend* backend, IoSqe* ioSqe, int res, uint32_t) { - static_cast(backend)->processFileOp(ioSqe, res); - } - - IoUringBackend::IoSqe* allocNewIoSqe(const 
EventCallback& /*cb*/) { - // allow pool alloc if numPooledIoSqeInUse_ < numEntries_ - auto* ret = new IoSqe(this, numPooledIoSqeInUse_ < numEntries_); - ++numPooledIoSqeInUse_; - ret->backendCb_ = IoUringBackend::processPollIoSqe; - - return ret; - } - - void cleanup(); - - struct io_uring_sqe* getUntrackedSqe(); - struct io_uring_sqe* getSqe(); - - /// some ring calls require being called on a single system thread, so we need - /// to delay init of those things until the correct thread is ready - void delayedInit(); - - /// init things that are linked to the io_uring submitter concept - /// so for DeferTaskrun, only do this in delayed init - void initSubmissionLinked(); - - Options options_; - size_t numEntries_; - std::unique_ptr timerEntry_; - std::unique_ptr signalReadEntry_; - IoSqeList freeList_; - bool usingDeferTaskrun_{false}; - - // timer related - int timerFd_{-1}; - bool timerChanged_{false}; - bool timerSet_{false}; - std::multimap timers_; - - // signal related - SocketPair signalFds_; - std::map> signals_; - - // submit - IoSqeBaseList submitList_; - uint16_t bufferProviderGidNext_{0}; - IoUringBufferProviderBase::UniquePtr bufferProvider_; - - // loop related - bool loopBreak_{false}; - bool shuttingDown_{false}; - bool processTimers_{false}; - bool processSignals_{false}; - IoSqeList activeEvents_; - size_t waitingToSubmit_{0}; - size_t numInsertedEvents_{0}; - size_t numInternalEvents_{0}; - size_t numSendEvents_{0}; - - // number of pooled IoSqe instances in use - size_t numPooledIoSqeInUse_{0}; - - // io_uring related - struct io_uring_params params_; - struct io_uring ioRing_; - - FdRegistry fdRegistry_; - - // poll callback to be invoked if POLL_CQ flag is set - // every time we poll for a CQE - CQPollLoopCallback cqPollLoopCallback_; - - bool needsDelayedInit_{true}; - - // stuff for ensuring we don't re-enter submit/getActiveEvents - folly::Optional submitTid_; - int isSubmitting_{0}; - bool gettingEvents_{false}; - void dCheckSubmitTid(); 
- void setSubmitting() noexcept { isSubmitting_++; } - void doneSubmitting() noexcept { isSubmitting_--; } - void setGetActiveEvents() { - if (kIsDebug && gettingEvents_) { - throw std::runtime_error("getting events is not reentrant"); - gettingEvents_ = true; - } - } - void doneGetActiveEvents() noexcept { gettingEvents_ = false; } - bool isSubmitting() const noexcept { return isSubmitting_; } -}; - -using PollIoBackend = IoUringBackend; -} // namespace folly - -#endif +#include diff --git a/folly/experimental/io/IoUringBase.h b/folly/experimental/io/IoUringBase.h index 0e46e367461..8bfccda7ccc 100644 --- a/folly/experimental/io/IoUringBase.h +++ b/folly/experimental/io/IoUringBase.h @@ -14,109 +14,4 @@ * limitations under the License. */ -#pragma once - -#include -#include -#include -#include - -struct io_uring_sqe; -struct io_uring_cqe; - -namespace folly { - -class IoUringBackend; - -struct IoSqeBase - : boost::intrusive::list_base_hook< - boost::intrusive::link_mode> { - enum class Type { - Unknown, - Read, - Write, - Open, - Close, - Connect, - Cancel, - }; - - IoSqeBase() : IoSqeBase(Type::Unknown) {} - explicit IoSqeBase(Type type) : type_(type) {} - // use raw addresses, so disallow copy/move - IoSqeBase(IoSqeBase&&) = delete; - IoSqeBase(const IoSqeBase&) = delete; - IoSqeBase& operator=(IoSqeBase&&) = delete; - IoSqeBase& operator=(const IoSqeBase&) = delete; - - virtual ~IoSqeBase() = default; - virtual void processSubmit(struct io_uring_sqe* sqe) noexcept = 0; - virtual void callback(const io_uring_cqe* cqe) noexcept = 0; - virtual void callbackCancelled(const io_uring_cqe* cqe) noexcept = 0; - IoSqeBase::Type type() const { return type_; } - bool inFlight() const { return inFlight_; } - bool cancelled() const { return cancelled_; } - void markCancelled() { cancelled_ = true; } - - protected: - // This is used if you want to prepare this sqe for reuse, but will manage the - // lifetime. 
For example for zerocopy send, you might want to reuse the sqe - // but still have a notification inbound. - void prepareForReuse() { internalUnmarkInflight(); } - - private: - friend class IoUringBackend; - void internalSubmit(struct io_uring_sqe* sqe) noexcept; - void internalCallback(const io_uring_cqe* cqe) noexcept; - void internalUnmarkInflight() { inFlight_ = false; } - - bool inFlight_ = false; - bool cancelled_ = false; - Type type_; -}; - -class IoUringBufferProviderBase { - protected: - uint16_t const gid_; - size_t const sizePerBuffer_; - - public: - struct Deleter { - void operator()(IoUringBufferProviderBase* base) { - if (base) { - base->destroy(); - } - } - }; - - using UniquePtr = std::unique_ptr; - explicit IoUringBufferProviderBase(uint16_t gid, size_t sizePerBuffer) - : gid_(gid), sizePerBuffer_(sizePerBuffer) {} - virtual ~IoUringBufferProviderBase() = default; - - IoUringBufferProviderBase(IoUringBufferProviderBase&&) = delete; - IoUringBufferProviderBase(IoUringBufferProviderBase const&) = delete; - IoUringBufferProviderBase& operator=(IoUringBufferProviderBase&&) = delete; - IoUringBufferProviderBase& operator=(IoUringBufferProviderBase const&) = - delete; - - size_t sizePerBuffer() const { return sizePerBuffer_; } - uint16_t gid() const { return gid_; } - - virtual uint32_t count() const noexcept = 0; - virtual void unusedBuf(uint16_t i) noexcept = 0; - virtual std::unique_ptr getIoBuf( - uint16_t i, size_t length) noexcept = 0; - virtual void enobuf() noexcept = 0; - virtual bool available() const noexcept = 0; - virtual void destroy() noexcept = 0; -}; - -struct IoUringFdRegistrationRecord : public boost::intrusive::slist_base_hook< - boost::intrusive::cache_last> { - int count_{0}; - int fd_{-1}; - int idx_{0}; -}; - -} // namespace folly +#include diff --git a/folly/experimental/io/IoUringEvent.h b/folly/experimental/io/IoUringEvent.h index c048aaffb6c..084baed0990 100644 --- a/folly/experimental/io/IoUringEvent.h +++ 
b/folly/experimental/io/IoUringEvent.h @@ -14,48 +14,4 @@ * limitations under the License. */ -#pragma once - -#include -#include -#include -#include -#include - -namespace folly { - -#if FOLLY_HAS_LIBURING - -class IoUringEvent : public EventHandler, public EventBase::LoopCallback { - public: - IoUringEvent( - folly::EventBase* eventBase, - IoUringBackend::Options const& o, - bool use_event_fd = true); - ~IoUringEvent() override; - - // cannot move/copy due to postLoopCallback - IoUringEvent const& operator=(IoUringEvent const&) = delete; - IoUringEvent&& operator=(IoUringEvent&&) = delete; - IoUringEvent(IoUringEvent&&) = delete; - IoUringEvent(IoUringEvent const&) = delete; - - void handlerReady(uint16_t events) noexcept override; - - void runLoopCallback() noexcept override; - - IoUringBackend& backend() { return backend_; } - - private: - bool hasWork(); - EventBase* eventBase_; - IoUringBackend backend_; - - bool lastWasResignalled_ = false; - bool edgeTriggered_ = false; - std::optional eventFd_; -}; - -#endif - -} // namespace folly +#include diff --git a/folly/experimental/io/IoUringEventBaseLocal.h b/folly/experimental/io/IoUringEventBaseLocal.h index 745d9149aa1..5e49f16f673 100644 --- a/folly/experimental/io/IoUringEventBaseLocal.h +++ b/folly/experimental/io/IoUringEventBaseLocal.h @@ -14,25 +14,4 @@ * limitations under the License. 
*/ -#pragma once - -#include -#include -#include - -namespace folly { - -#if FOLLY_HAS_LIBURING - -class IoUringEventBaseLocal { - public: - static void attach( - EventBase* evb, - IoUringBackend::Options const& options, - bool use_eventfd = true); - static IoUringBackend* try_get(EventBase* evb); -}; - -#endif - -} // namespace folly +#include diff --git a/folly/experimental/io/IoUringProvidedBufferRing.h b/folly/experimental/io/IoUringProvidedBufferRing.h index b3dd1701593..6dc27e634d8 100644 --- a/folly/experimental/io/IoUringProvidedBufferRing.h +++ b/folly/experimental/io/IoUringProvidedBufferRing.h @@ -14,117 +14,4 @@ * limitations under the License. */ -#pragma once - -#include -#include -#include - -#if FOLLY_HAS_LIBURING - -#include // @manual - -namespace folly { - -class IoUringProvidedBufferRing : public IoUringBufferProviderBase { - public: - class LibUringCallError : public std::runtime_error { - public: - using std::runtime_error::runtime_error; - }; - - IoUringProvidedBufferRing( - io_uring* ioRingPtr, - uint16_t gid, - int count, - int bufferShift, - int ringSizeShift); - - void enobuf() noexcept override; - void unusedBuf(uint16_t i) noexcept override; - void destroy() noexcept override; - std::unique_ptr getIoBuf(uint16_t i, size_t length) noexcept override; - - uint32_t count() const noexcept override { return buffer_.bufferCount(); } - bool available() const noexcept override { - return !enobuf_.load(std::memory_order_relaxed); - } - - private: - void initialRegister(); - void returnBufferInShutdown() noexcept; - void returnBuffer(uint16_t i) noexcept; - - std::atomic* sharedTail() { - return reinterpret_cast*>(&buffer_.ring()->tail); - } - - bool tryPublish(uint16_t expected, uint16_t value) noexcept { - return sharedTail()->compare_exchange_strong( - expected, value, std::memory_order_release); - } - - char const* getData(uint16_t i) { return buffer_.buffer(i); } - - class ProvidedBuffersBuffer { - public: - ProvidedBuffersBuffer( - int 
count, int bufferShift, int ringCountShift, bool huge_pages); - ~ProvidedBuffersBuffer() { ::munmap(buffer_, allSize_); } - - static size_t calcBufferSize(int bufferShift) { - return 1LLU << std::max(5, bufferShift); - } - - struct io_uring_buf_ring* ring() const noexcept { return ringPtr_; } - - struct io_uring_buf* ringBuf(int idx) const noexcept { - return &ringPtr_->bufs[idx & ringMask_]; - } - - uint32_t bufferCount() const noexcept { return bufferCount_; } - uint32_t ringCount() const noexcept { return 1 + ringMask_; } - - char* buffer(uint16_t idx) { - size_t offset = (size_t)idx << bufferShift_; - return bufferBuffer_ + offset; - } - - size_t sizePerBuffer() const { return sizePerBuffer_; } - - private: - void* buffer_; - size_t allSize_; - - size_t ringMemSize_; - struct io_uring_buf_ring* ringPtr_; - int ringMask_; - - size_t bufferSize_; - size_t bufferShift_; - size_t sizePerBuffer_; - char* bufferBuffer_; - uint32_t bufferCount_; - - // static constexpr - static constexpr size_t kHugePageMask = (1LLU << 21) - 1; // 2MB - static constexpr size_t kPageMask = (1LLU << 12) - 1; // 4095 - static constexpr size_t kBufferAlignMask{31LLU}; - }; - - io_uring* ioRingPtr_; - ProvidedBuffersBuffer buffer_; - std::atomic enobuf_{false}; - std::vector ioBufCallbacks_; - - uint64_t gottenBuffers_{0}; - std::atomic returnedBuffers_{0}; - - std::atomic wantsShutdown_{false}; - std::atomic shutdownReferences_; - std::mutex shutdownMutex_; -}; - -} // namespace folly - -#endif +#include diff --git a/folly/experimental/io/Liburing.h b/folly/experimental/io/Liburing.h index 8e81aaae9ca..8a2bf6b2e54 100644 --- a/folly/experimental/io/Liburing.h +++ b/folly/experimental/io/Liburing.h @@ -14,10 +14,4 @@ * limitations under the License. 
*/ -#pragma once - -#if defined(__linux__) && __has_include() -#define FOLLY_HAS_LIBURING 1 -#else -#define FOLLY_HAS_LIBURING 0 -#endif +#include diff --git a/folly/experimental/io/MuxIOThreadPoolExecutor.h b/folly/experimental/io/MuxIOThreadPoolExecutor.h index cfe25983eb5..84be88095ea 100644 --- a/folly/experimental/io/MuxIOThreadPoolExecutor.h +++ b/folly/experimental/io/MuxIOThreadPoolExecutor.h @@ -14,151 +14,4 @@ * limitations under the License. */ -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace folly { - -/** - * NOTE: This is highly experimental. Do not use. - * - * A pool of EventBases scheduled over a pool of threads. - * - * Intended as a drop-in replacement for folly::IOThreadPoolExecutor, but with a - * substantially different design: EventBases are not pinned to threads, so it - * is possible to have more EventBases than threads. EventBases that have ready - * events can be scheduled on any of the threads in the pool, with the - * scheduling governed by ThrottledLifoSem. - * - * This allows to batch the loops of multiple EventBases on a single thread as - * long as each runs for a short enough time, reducing the number of wake-ups - * and allowing for better load balancing across handlers. For example, we can - * create a large number of EventBases processed by a smaller number of threads - * and distribute the handlers. - * - * The number of EventBases is set at construction time and cannot be changed - * later. The number of threads can be changed dynamically, but setting it to 0 - * is not supported (otherwise no thread would be left to drive the EventBases) - * and it is not useful to run more threads than EventBases, so that is not - * supported either: attempting to set the number of threads to 0 or to a value - * greater than numEventBases() (either in construction or using - * setNumThreads()) will throw std::invalid_argument). 
- */ -class MuxIOThreadPoolExecutor : public IOThreadPoolExecutorBase { - public: - struct Options { - Options() {} - - Options& setEnableThreadIdCollection(bool b) { - enableThreadIdCollection = b; - return *this; - } - - Options& setNumEventBases(size_t num) { - numEventBases = num; - return *this; - } - - Options& setWakeUpInterval(std::chrono::nanoseconds w) { - wakeUpInterval = w; - return *this; - } - - Options& setIdleSpinMax(std::chrono::nanoseconds s) { - idleSpinMax = s; - return *this; - } - - bool enableThreadIdCollection{false}; - // If 0, the number of EventBases is set to the number of threads. - size_t numEventBases{0}; - std::chrono::nanoseconds wakeUpInterval{std::chrono::microseconds{100}}; - // Max spin for an idle thread waiting for work before going to sleep. - std::chrono::nanoseconds idleSpinMax = std::chrono::microseconds{10}; - }; - - explicit MuxIOThreadPoolExecutor( - size_t numThreads, - Options options = {}, - std::shared_ptr threadFactory = - std::make_shared("MuxIOTPEx"), - folly::EventBaseManager* ebm = folly::EventBaseManager::get()); - - ~MuxIOThreadPoolExecutor() override; - - size_t numEventBases() const { return numEventBases_; } - - void add(Func func) override; - void add( - Func func, - std::chrono::milliseconds expiration, - Func expireCallback = nullptr) override; - - folly::EventBase* getEventBase() override; - - // Returns all the EventBase instances - std::vector> getAllEventBases() - override; - - folly::EventBaseManager* getEventBaseManager() override; - - // Returns nullptr unless explicitly enabled through constructor - folly::WorkerProvider* getThreadIdCollector() override { - return threadIdCollector_.get(); - } - - void addObserver(std::shared_ptr o) override; - void removeObserver(std::shared_ptr o) override; - - void stop() override; - void join() override; - - private: - using EventBasePoller = folly::detail::EventBasePoller; - - struct EvbState; - - struct alignas(Thread) IOThread : public Thread { - 
EvbState* curEvbState; // Only accessed inside the worker thread. - }; - - void maybeUnregisterEventBases(Observer* o); - - void validateNumThreads(size_t numThreads) override; - ThreadPtr makeThread() override; - EvbState& pickEvbState(); - void threadRun(ThreadPtr thread) override; - void stopThreads(size_t n) override; - size_t getPendingTaskCountImpl() const override final; - - const Options options_; - const size_t numEventBases_; - folly::EventBaseManager* eventBaseManager_; - - std::unique_ptr fdGroup_; - std::vector> evbStates_; - std::vector> keepAlives_; - - relaxed_atomic nextEvb_{0}; - folly::ThreadLocal> thisThread_; - std::unique_ptr threadIdCollector_; - std::atomic pendingTasks_{0}; - - USPMCQueue readyQueue_; - folly::ThrottledLifoSem readyQueueSem_; -}; - -} // namespace folly +#include diff --git a/folly/experimental/io/SimpleAsyncIO.h b/folly/experimental/io/SimpleAsyncIO.h index fd7310016c2..cd9de25b709 100644 --- a/folly/experimental/io/SimpleAsyncIO.h +++ b/folly/experimental/io/SimpleAsyncIO.h @@ -14,201 +14,4 @@ * limitations under the License. */ -#pragma once - -#include - -#include -#include -#include -#include -#include -#include - -namespace folly { - -/** - * SimpleAsyncIO is a wrapper around AsyncIO intended to hide all the details. - * - * Usage: just create an instance of SimpleAsyncIO and then issue IO with - * pread and pwrite, no other effort required. e.g.: - * - * - * auto tmpfile = folly::File::temporary(); - * folly::SimpleAsyncIO aio; - * aio.pwrite( - * tmpfile.fd(), - * "hello world", - * 11, // size - * 0, // offset - * [](int rc) { LOG(INFO) << "Write completed with rc " << rc; }); - * - * - * IO is dispatched in the context of the calling thread; it may block briefly - * to obtain a lock on shared resources, but will *not* block for IO - * completion. If the IO queue is full (see setMaxRequests(size_t) in Config), - * IO fails with -EBUSY. 
- * - * IO is completed on the executor specified in the config (global CPU - * executor by default). - * - * IO is completed by calling the callback function provided to pread/pwrite. - * The single parameter to the callback is either a negative errno or the - * number of bytes transferred. - * - * There is a "hidden" EventBase which polls for IO completion and dispatches - * completion events to the executor. You may specify an existing EventBase in - * the config (and you are then responsible for making sure the EventBase - * instance outlives the SimpleAsyncIO instance). If you do not specify one, a - * ScopedEventBaseThread instance will be created. - * - * Following structure defines the configuration of a SimpleAsyncIO instance, - * in case you need to override the (sensible) defaults. - * - * Typical usage is something like: - * - * SimpleAsyncIO io(SimpleAsyncIO::Config() - * .setMaxRequests(100) - * .setMode(SimpleAsyncIO::Mode::IOURING)); - */ -class SimpleAsyncIO : public EventHandler { - public: - /** - * The asynchronized backend to be used: libaio or liburing - */ - enum Mode { - /// use libaio - AIO, - /// use liburing - IOURING - }; - /** - * The Config for SimpleAsyncIO on: - * - choosing backend implementation - * - executor to use for receiving completion - * - max requests are allowed - */ - struct Config { - Config() - : maxRequests_(1000), - completionExecutor_( - getKeepAliveToken(getUnsafeMutableGlobalCPUExecutor().get())), - mode_(AIO), - evb_(nullptr) {} - /// Maximum requests can be queued; -EBUSY returned for requests above - /// threshold - Config& setMaxRequests(size_t maxRequests) { - maxRequests_ = maxRequests; - return *this; - } - Config& setCompletionExecutor(Executor::KeepAlive<> completionExecutor) { - completionExecutor_ = completionExecutor; - return *this; - } - Config& setMode(Mode mode) { - mode_ = mode; - return *this; - } - Config& setEventBase(EventBase* evb) { - evb_ = evb; - return *this; - } - - private: - size_t 
maxRequests_; - Executor::KeepAlive<> completionExecutor_; - Mode mode_; - EventBase* evb_; - - friend class SimpleAsyncIO; - }; - - explicit SimpleAsyncIO(Config cfg = Config()); - virtual ~SimpleAsyncIO() override; - - using SimpleAsyncIOCompletor = Function; - - /** - * Initiate an asynchronous read request. - * - * Parameters and return value are same as pread(2). - * - * Completion is indicated by an asynchronous call to the given completor - * callback. The sole parameter to the callback is the result of the - * operation. - * - * @returns Same as pread(2) and if requests number reaches maxRequests_, - * return -EBUSY - */ - void pread( - int fd, - void* buf, - size_t size, - off_t start, - SimpleAsyncIOCompletor completor); - - /** - * Initiate an asynchronous write request. - * - * Parameters and return value are same as pwrite(2). - * - * Completion is indicated by an asynchronous call to the given completor - * callback. The sole parameter to the callback is the result of the - * operation. - * - * @returns Same as pwrite(2) and if requests number reaches maxRequests_, - * return -EBUSY - */ - void pwrite( - int fd, - const void* data, - size_t size, - off_t offset, - SimpleAsyncIOCompletor completor); - -#if FOLLY_HAS_COROUTINES - /** - * Coroutine version of pread(). - * - * Identical to pread() except that result is obtained by co_await instead of - * callback. - * - * @returns Same as pread(2) and if requests number reaches maxRequests_, - * return -EBUSY - */ - folly::coro::Task co_pread(int fd, void* buf, size_t size, off_t start); - /** - * Coroutine version of pwrite(). - * - * Identical to pwrite() except that result is obtained by co_await instead of - * callback. 
- * - * @returns Same as pwrite(2) and if requests number reaches maxRequests_, - * return -EBUSY - */ - folly::coro::Task co_pwrite( - int fd, const void* buf, size_t size, off_t start); -#endif - - private: - std::unique_ptr getOp(); - void putOp(std::unique_ptr&&); - - void submitOp( - Function preparer, SimpleAsyncIOCompletor completor); - - virtual void handlerReady(uint16_t events) noexcept override; - - template - void init(); - - size_t maxRequests_; - Executor::KeepAlive<> completionExecutor_; - std::unique_ptr asyncIO_; - Synchronized>> opsFreeList_; - std::unique_ptr evb_; - bool terminating_; - Baton<> drainedBaton_; -}; - -} // namespace folly +#include diff --git a/folly/experimental/io/test/BUCK b/folly/experimental/io/test/BUCK index 4cdf1c5ae39..5d1f9ffe72f 100644 --- a/folly/experimental/io/test/BUCK +++ b/folly/experimental/io/test/BUCK @@ -1,4 +1,3 @@ -load("@fbcode_macros//build_defs:cpp_binary.bzl", "cpp_binary") load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") @@ -40,69 +39,6 @@ cpp_library( ], ) -cpp_unittest( - name = "async_io_test", - srcs = ["AsyncIOTest.cpp"], - supports_static_listing = False, - deps = [ - ":async_base_test_lib", - "//folly/experimental/io:async_io", - ], -) - -cpp_binary( - name = "io_uring_backend_bench", - srcs = ["IoUringBackendBench.cpp"], - headers = [], - deps = [ - "//folly:benchmark", - "//folly:file_util", - "//folly/experimental/io:epoll_backend", - "//folly/experimental/io:io_uring_backend", - "//folly/init:init", - "//folly/io/async:async_base", - "//folly/io/async:scoped_event_base_thread", - "//folly/portability:gflags", - ], -) - -cpp_unittest( - name = "io_uring_test", - srcs = ["IoUringTest.cpp"], - owner = "dmm@xmail.facebook.com", - supports_static_listing = False, - deps = [ - ":async_base_test_lib", - "//folly/experimental/io:io_uring", - "//folly/init:init", - ], -) - -cpp_unittest( - name = 
"io_uring_backend_setup_test", - srcs = ["IoUringBackendSetupTest.cpp"], - owner = "kvigor@xmail.facebook.com", - deps = [ - "//folly/experimental/io:io_uring_backend", - "//folly/portability:gtest", - ], -) - -cpp_binary( - name = "io_benchmark", - srcs = ["IOBenchmark.cpp"], - headers = [], - deps = [ - ":async_base_test_lib", - ":io_test_temp_file_util_lib", - "//folly:benchmark", - "//folly:file_util", - "//folly/experimental/io:async_io", - "//folly/experimental/io:io_uring", - "//folly/portability:gflags", - ], -) - cpp_library( name = "mux_io_thread_pool_executor_test_lib", srcs = ["MuxIOThreadPoolExecutorTest.cpp"], @@ -150,116 +86,3 @@ cpp_unittest( ":mux_io_thread_pool_executor_test_lib", # @manual ], ) - -cpp_unittest( - name = "epoll_backend_test", - srcs = ["EpollBackendTest.cpp"], - owner = "dmm@xmail.facebook.com", - supports_static_listing = False, - deps = [ - "//folly/experimental/io:epoll_backend", - "//folly/io/async/test:async_signal_handler_test_lib", - "//folly/io/async/test:event_base_test_lib", - ], -) - -cpp_unittest( - name = "io_uring_backend_test", - srcs = ["IoUringBackendTest.cpp"], - headers = [], - owner = "dmm@xmail.facebook.com", - supports_static_listing = False, - deps = [ - ":io_test_temp_file_util_lib", - "//folly:file_util", - "//folly:function", - "//folly:string", - "//folly/experimental/io:io_uring_backend", - "//folly/init:init", - "//folly/io/async:async_base", - "//folly/io/async:async_udp_server_socket", - "//folly/io/async:async_udp_socket", - "//folly/io/async/test:async_signal_handler_test_lib", - "//folly/io/async/test:event_base_test_lib", - "//folly/portability:gtest", - ], -) - -cpp_unittest( - name = "io_uring_event_test", - srcs = ["IoUringEventTest.cpp"], - owner = "dylany@xmail.facebook.com", - supports_static_listing = False, - deps = [ - "//folly/experimental/io:io_uring_backend", - "//folly/experimental/io:io_uring_event", - "//folly/futures:core", - "//folly/io/async:async_base", - 
"//folly/portability:gtest", - ], -) - -cpp_unittest( - name = "io_uring_event_base_local_test", - srcs = ["IoUringEventBaseLocalTest.cpp"], - owner = "dylany@xmail.facebook.com", - deps = [ - "//folly/experimental/io:io_uring_backend", - "//folly/experimental/io:io_uring_event_base_local", - "//folly/futures:core", - "//folly/portability:gtest", - ], -) - -cpp_unittest( - name = "async_io_uring_socket_test", - srcs = ["AsyncIoUringSocketTest.cpp"], - supports_static_listing = False, - deps = [ - "//folly:file_util", - "//folly:subprocess", - "//folly/executors:global_executor", - "//folly/experimental/io:async_io_uring_socket", - "//folly/experimental/io:io_uring_backend", - "//folly/experimental/io:io_uring_event", - "//folly/futures:core", - "//folly/io/async:async_base", - "//folly/io/async:async_socket", - "//folly/io/async:server_socket", - "//folly/portability:gtest", - "//folly/system:shell", - "//folly/test:socket_address_test_helper", - ], -) - -cpp_binary( - name = "registered_fd_benchmark", - srcs = ["RegisteredFdBenchmark.cpp"], - headers = [], - deps = [ - "//folly:benchmark", - "//folly:file_util", - "//folly/experimental/io:io_uring_backend", - "//folly/io/async:async_base", - "//folly/portability:gflags", - ], -) - -cpp_unittest( - name = "simple_async_io_test", - srcs = ["SimpleAsyncIOTest.cpp"], - supports_static_listing = False, - deps = [ - "//folly:file", - "//folly:random", - "//folly/experimental/coro:blocking_wait", - "//folly/experimental/coro:collect", - "//folly/experimental/io:simple_async_io", - "//folly/io:iobuf", - "//folly/portability:gtest", - "//folly/synchronization:baton", - ], - external_deps = [ - "glog", - ], -) diff --git a/folly/experimental/io/AsyncBase.cpp b/folly/io/async/AsyncBase.cpp similarity index 99% rename from folly/experimental/io/AsyncBase.cpp rename to folly/io/async/AsyncBase.cpp index 884664b6310..61cc6afa406 100644 --- a/folly/experimental/io/AsyncBase.cpp +++ b/folly/io/async/AsyncBase.cpp @@ -14,7 +14,7 
@@ * limitations under the License. */ -#include +#include #include #include diff --git a/folly/io/async/AsyncBase.h b/folly/io/async/AsyncBase.h new file mode 100644 index 00000000000..f72fd139e66 --- /dev/null +++ b/folly/io/async/AsyncBase.h @@ -0,0 +1,318 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace folly { +class AsyncIOOp; +class IoUringOp; +/** + * An AsyncBaseOp represents a pending operation. You may set a notification + * callback or you may use this class's methods directly. + * + * The op must remain allocated until it is completed or canceled. + */ +class AsyncBaseOp { + friend class AsyncBase; + + public: + using NotificationCallback = folly::Function; + + explicit AsyncBaseOp(NotificationCallback cb = NotificationCallback()); + AsyncBaseOp(const AsyncBaseOp&) = delete; + AsyncBaseOp& operator=(const AsyncBaseOp&) = delete; + virtual ~AsyncBaseOp(); + + enum class State { + UNINITIALIZED, + INITIALIZED, + PENDING, + COMPLETED, + CANCELED, + }; + + /** + * Initiate a read request. 
+ */ + virtual void pread(int fd, void* buf, size_t size, off_t start) = 0; + void pread(int fd, Range range, off_t start) { + pread(fd, range.begin(), range.size(), start); + } + virtual void preadv(int fd, const iovec* iov, int iovcnt, off_t start) = 0; + virtual void pread( + int fd, void* buf, size_t size, off_t start, int /*buf_index*/) { + pread(fd, buf, size, start); + } + + /** + * Initiate a write request. + */ + virtual void pwrite(int fd, const void* buf, size_t size, off_t start) = 0; + void pwrite(int fd, Range range, off_t start) { + pwrite(fd, range.begin(), range.size(), start); + } + virtual void pwritev(int fd, const iovec* iov, int iovcnt, off_t start) = 0; + virtual void pwrite( + int fd, const void* buf, size_t size, off_t start, int /*buf_index*/) { + pwrite(fd, buf, size, start); + } + + // we support only these subclasses + virtual AsyncIOOp* getAsyncIOOp() = 0; + virtual IoUringOp* getIoUringOp() = 0; + + // ostream output + virtual void toStream(std::ostream& os) const = 0; + + /** + * Return the current operation state. + */ + State state() const { return state_; } + + /** + * user data get/set + */ + void* getUserData() const { return userData_; } + + void setUserData(void* userData) { userData_ = userData; } + + /** + * Reset the operation for reuse. It is an error to call reset() on + * an Op that is still pending. + */ + virtual void reset(NotificationCallback cb = NotificationCallback()) = 0; + + void setNotificationCallback(NotificationCallback cb) { cb_ = std::move(cb); } + + /** + * Get the notification callback from the op. + * + * Note that this moves the callback out, leaving the callback in an + * uninitialized state! You must call setNotificationCallback before + * submitting the operation! + */ + NotificationCallback getNotificationCallback() { return std::move(cb_); } + + /** + * Retrieve the result of this operation. 
Returns >=0 on success, + * -errno on failure (that is, using the Linux kernel error reporting + * conventions). Use checkKernelError (folly/Exception.h) on the result to + * throw a std::system_error in case of error instead. + * + * It is an error to call this if the Op hasn't completed. + */ + ssize_t result() const; + + // debug helper + static std::string fd2name(int fd); + + protected: + void init(); + void start(); + void unstart(); + void complete(ssize_t result); + void cancel(); + + NotificationCallback cb_; + std::atomic state_; + ssize_t result_; + void* userData_{nullptr}; +}; + +std::ostream& operator<<(std::ostream& os, const AsyncBaseOp& op); +std::ostream& operator<<(std::ostream& os, AsyncBaseOp::State state); + +/** + * Generic C++ interface around Linux IO(io_submit, io_uring) + */ +class AsyncBase { + public: + using Op = AsyncBaseOp; + + enum PollMode { + NOT_POLLABLE, + POLLABLE, + }; + + /** + * Create an AsyncBase context capable of holding at most 'capacity' pending + * requests at the same time. As requests complete, others can be scheduled, + * as long as this limit is not exceeded. + * + * If pollMode is POLLABLE, pollFd() will return a file descriptor that + * can be passed to poll / epoll / select and will become readable when + * any IOs on this AsyncBase have completed. If you do this, you must use + * pollCompleted() instead of wait() -- do not read from the pollFd() + * file descriptor directly. + * + * You may use the same AsyncBase object from multiple threads, as long as + * there is only one concurrent caller of wait() / pollCompleted() / cancel() + * (perhaps by always calling it from the same thread, or by providing + * appropriate mutual exclusion). In this case, pending() returns a snapshot + * of the current number of pending requests. 
+ */ + explicit AsyncBase(size_t capacity, PollMode pollMode = NOT_POLLABLE); + AsyncBase(const AsyncBase&) = delete; + AsyncBase& operator=(const AsyncBase&) = delete; + virtual ~AsyncBase(); + + /** + * Initialize context + */ + virtual void initializeContext() = 0; + + /** + * Wait for at least minRequests to complete. Returns the requests that + * have completed; the returned range is valid until the next call to + * wait(). minRequests may be 0 to not block. + */ + Range wait(size_t minRequests); + + /** + * Cancel all pending requests and return them; the returned range is + * valid until the next call to cancel(). + */ + Range cancel(); + + /** + * Return the number of pending requests. + */ + size_t pending() const { return pending_; } + + /** + * Return the maximum number of requests that can be kept outstanding + * at any one time. + */ + size_t capacity() const { return capacity_; } + + /** + * Return the accumulative number of submitted I/O, since this object + * has been created. + */ + size_t totalSubmits() const { return submitted_; } + + /** + * If POLLABLE, return a file descriptor that can be passed to poll / epoll + * and will become readable when any async IO operations have completed. + * If NOT_POLLABLE, return -1. + */ + int pollFd() const { return pollFd_; } + + /** + * If POLLABLE, call instead of wait after the file descriptor returned + * by pollFd() became readable. The returned range is valid until the next + * call to pollCompleted(). + */ + Range pollCompleted(); + + /** + * Submit an op for execution. 
+ */ + void submit(Op* op); + + /** + * Submit a range of ops for execution + */ + int submit(Range ops); + + protected: + virtual int drainPollFd() = 0; + void complete(Op* op, ssize_t result) { op->complete(result); } + + void cancel(Op* op) { op->cancel(); } + + bool isInit() const { return init_.load(std::memory_order_relaxed); } + + void decrementPending(size_t num = 1); + virtual int submitOne(AsyncBase::Op* op) = 0; + virtual int submitRange(Range ops) = 0; + + enum class WaitType { COMPLETE, CANCEL }; + virtual Range doWait( + WaitType type, + size_t minRequests, + size_t maxRequests, + std::vector& result) = 0; + + std::atomic init_{false}; + std::mutex initMutex_; + + std::atomic pending_{0}; + std::atomic submitted_{0}; + const size_t capacity_; + const PollMode pollMode_; + int pollFd_{-1}; + std::vector completed_; + std::vector canceled_; +}; + +/** + * Wrapper around AsyncBase that allows you to schedule more requests than + * the AsyncBase's object capacity. Other requests are queued and processed + * in a FIFO order. + */ +class AsyncBaseQueue { + public: + /** + * Create a queue, using the given AsyncBase object. + * The AsyncBase object may not be used by anything else until the + * queue is destroyed. + */ + explicit AsyncBaseQueue(AsyncBase* asyncBase); + ~AsyncBaseQueue(); + + size_t queued() const { return queue_.size(); } + + /** + * Submit an op to the AsyncBase queue. The op will be queued until + * the AsyncBase object has room. + */ + void submit(AsyncBaseOp* op); + + /** + * Submit a delayed op to the AsyncBase queue; this allows you to postpone + * creation of the Op (which may require allocating memory, etc) until + * the AsyncBase object has room. 
+ */ + using OpFactory = std::function; + void submit(OpFactory op); + + private: + void onCompleted(AsyncBaseOp* op); + void maybeDequeue(); + + AsyncBase* asyncBase_; + + std::deque queue_; +}; + +} // namespace folly diff --git a/folly/experimental/io/AsyncIO.cpp b/folly/io/async/AsyncIO.cpp similarity index 99% rename from folly/experimental/io/AsyncIO.cpp rename to folly/io/async/AsyncIO.cpp index 5d2f894a511..11566a11dd2 100644 --- a/folly/experimental/io/AsyncIO.cpp +++ b/folly/io/async/AsyncIO.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/folly/io/async/AsyncIO.h b/folly/io/async/AsyncIO.h new file mode 100644 index 00000000000..b1a6f52eeed --- /dev/null +++ b/folly/io/async/AsyncIO.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#if __has_include() + +#include + +namespace folly { + +class AsyncIOOp : public AsyncBaseOp { + friend class AsyncIO; + friend std::ostream& operator<<(std::ostream& os, const AsyncIOOp& o); + + public: + explicit AsyncIOOp(NotificationCallback cb = NotificationCallback()); + AsyncIOOp(const AsyncIOOp&) = delete; + AsyncIOOp& operator=(const AsyncIOOp&) = delete; + ~AsyncIOOp() override; + + /** + * Initiate a read request. 
+ */ + void pread(int fd, void* buf, size_t size, off_t start) override; + void preadv(int fd, const iovec* iov, int iovcnt, off_t start) override; + + /** + * Initiate a write request. + */ + void pwrite(int fd, const void* buf, size_t size, off_t start) override; + void pwritev(int fd, const iovec* iov, int iovcnt, off_t start) override; + + void reset(NotificationCallback cb = NotificationCallback()) override; + + AsyncIOOp* getAsyncIOOp() override { return this; } + + IoUringOp* getIoUringOp() override { return nullptr; } + + void toStream(std::ostream& os) const override; + + const iocb& getIocb() const { return iocb_; } + + private: + iocb iocb_; +}; + +std::ostream& operator<<(std::ostream& os, const AsyncIOOp& op); + +/** + * C++ interface around Linux Async IO. + */ +class AsyncIO : public AsyncBase { + public: + using Op = AsyncIOOp; + + /** + * Note: the maximum number of allowed concurrent requests is controlled + * by the fs.aio-max-nr sysctl, the default value is usually 64K. 
+ */ + explicit AsyncIO(size_t capacity, PollMode pollMode = NOT_POLLABLE); + AsyncIO(const AsyncIO&) = delete; + AsyncIO& operator=(const AsyncIO&) = delete; + ~AsyncIO() override; + + void initializeContext() override; + + protected: + int drainPollFd() override; + int submitOne(AsyncBase::Op* op) override; + int submitRange(Range ops) override; + + private: + Range doWait( + WaitType type, + size_t minRequests, + size_t maxRequests, + std::vector& result) override; + + io_context_t ctx_{nullptr}; +}; + +using AsyncIOQueue = AsyncBaseQueue; +} // namespace folly + +#endif diff --git a/folly/experimental/io/AsyncIoUringSocket.cpp b/folly/io/async/AsyncIoUringSocket.cpp similarity index 99% rename from folly/experimental/io/AsyncIoUringSocket.cpp rename to folly/io/async/AsyncIoUringSocket.cpp index e2b5b88ee41..ccf84f0d98f 100644 --- a/folly/experimental/io/AsyncIoUringSocket.cpp +++ b/folly/io/async/AsyncIoUringSocket.cpp @@ -16,9 +16,9 @@ #include #include -#include #include #include +#include #include #include #include diff --git a/folly/io/async/AsyncIoUringSocket.h b/folly/io/async/AsyncIoUringSocket.h new file mode 100644 index 00000000000..63ad65d324f --- /dev/null +++ b/folly/io/async/AsyncIoUringSocket.h @@ -0,0 +1,517 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace folly { +class AsyncDetachFdCallback { + public: + virtual ~AsyncDetachFdCallback() = default; + virtual void fdDetached( + NetworkSocket ns, std::unique_ptr unread) noexcept = 0; + virtual void fdDetachFail(const AsyncSocketException& ex) noexcept = 0; +}; +} // namespace folly + +#if FOLLY_HAS_LIBURING +class IoUringBackend; + +namespace folly { + +class AsyncIoUringSocket : public AsyncSocketTransport { + public: + using Cert = folly::AsyncTransportCertificate; + struct Options { + Options() + : allocateNoBufferPoolBuffer(defaultAllocateNoBufferPoolBuffer), + multishotRecv(true) {} + + static std::unique_ptr defaultAllocateNoBufferPoolBuffer(); + folly::Function()> allocateNoBufferPoolBuffer; + folly::Optional zeroCopyEnable; + bool multishotRecv; + }; + + using UniquePtr = std::unique_ptr; + explicit AsyncIoUringSocket( + AsyncTransport::UniquePtr other, Options&& options = Options{}); + explicit AsyncIoUringSocket(AsyncSocket* sock, Options&& options = Options{}); + explicit AsyncIoUringSocket(EventBase* evb, Options&& options = Options{}); + explicit AsyncIoUringSocket( + EventBase* evb, NetworkSocket ns, Options&& options = Options{}); + + static bool supports(EventBase* backend); + + void connect( + AsyncSocket::ConnectCallback* callback, + const folly::SocketAddress& address, + std::chrono::milliseconds timeout = std::chrono::milliseconds(0), + SocketOptionMap const& options = emptySocketOptionMap, + const SocketAddress& bindAddr = anyAddress(), + const std::string& ifName = std::string()) noexcept; + + void connect( + ConnectCallback* callback, + const folly::SocketAddress& address, + int timeout, + SocketOptionMap const& options, + const SocketAddress& bindAddr, + const std::string& ifName) noexcept 
override { + connect( + callback, + address, + std::chrono::milliseconds(timeout), + options, + bindAddr, + ifName); + } + + std::chrono::nanoseconds getConnectTime() const { + return connectEndTime_ - connectStartTime_; + } + + // AsyncSocketBase + EventBase* getEventBase() const override { return evb_; } + + // AsyncReader + void setReadCB(ReadCallback* callback) override; + + ReadCallback* getReadCallback() const override { + return readSqe_->readCallback(); + } + std::unique_ptr takePreReceivedData() override { + return readSqe_->takePreReceivedData(); + } + + // AsyncWriter + void write(WriteCallback*, const void*, size_t, WriteFlags = WriteFlags::NONE) + override; + void writev( + WriteCallback*, + const iovec*, + size_t, + WriteFlags = WriteFlags::NONE) override; + void writeChain( + WriteCallback* callback, + std::unique_ptr&& buf, + WriteFlags flags) override; + bool canZC(std::unique_ptr const& buf) const; + + // AsyncTransport + void close() override; + void closeNow() override; + void closeWithReset() override; + void shutdownWrite() override; + void shutdownWriteNow() override; + + bool good() const override; + bool readable() const override { return good(); } + bool error() const override; + bool hangup() const override; + + bool connecting() const override { + return connectSqe_ && connectSqe_->inFlight(); + } + + void attachEventBase(EventBase*) override; + void detachEventBase() override; + bool isDetachable() const override; + + uint32_t getSendTimeout() const override { + return static_cast( + std::chrono::duration_cast(writeTimeoutTime_) + .count()); + } + + void setSendTimeout(uint32_t ms) override; + + void getLocalAddress(SocketAddress* address) const override; + + void getPeerAddress(SocketAddress*) const override; + + void setPreReceivedData(std::unique_ptr data) override; + void cacheAddresses() override; + + /** + * @return True iff end of record tracking is enabled + */ + bool isEorTrackingEnabled() const override { return false; } + + 
void setEorTracking(bool) override { + // don't support this. + // as far as I can see this is only used by AsyncSSLSocket, but TLS1.3 + // supercedes this so I think we can ignore it. + throw std::runtime_error( + "AsyncIoUringSocket::setEorTracking not supported"); + } + + size_t getAppBytesWritten() const override { return getRawBytesWritten(); } + size_t getRawBytesWritten() const override { return bytesWritten_; } + size_t getAppBytesReceived() const override { return getRawBytesReceived(); } + size_t getRawBytesReceived() const override; + + const AsyncTransport* getWrappedTransport() const override { return nullptr; } + + // AsyncSocketTransport + int setNoDelay(bool noDelay) override; + int setSockOpt( + int level, int optname, const void* optval, socklen_t optsize) override; + + std::string getSecurityProtocol() const override { return securityProtocol_; } + std::string getApplicationProtocol() const noexcept override { + return applicationProtocol_; + } + NetworkSocket getNetworkSocket() const override { return fd_; } + + void setSecurityProtocol(std::string s) { securityProtocol_ = std::move(s); } + void setApplicationProtocol(std::string s) { + applicationProtocol_ = std::move(s); + } + + const folly::AsyncTransportCertificate* getPeerCertificate() const override { + return peerCert_.get(); + } + + const folly::AsyncTransportCertificate* getSelfCertificate() const override { + return selfCert_.get(); + } + + void dropPeerCertificate() noexcept override { peerCert_.reset(); } + + void dropSelfCertificate() noexcept override { selfCert_.reset(); } + + void setPeerCertificate(const std::shared_ptr& peerCert) { + peerCert_ = peerCert; + } + + void setSelfCertificate(const std::shared_ptr& selfCert) { + selfCert_ = selfCert; + } + + void asyncDetachFd(AsyncDetachFdCallback* callback); + bool readSqeInFlight() const { return readSqe_->inFlight(); } + bool getTFOSucceded() const override; + void enableTFO() override { + // No-op if folly does not allow tfo 
+#if FOLLY_ALLOW_TFO + VLOG(5) << "AsyncIoUringSocket::enableTFO()"; + enableTFO_ = true; +#endif + } + + void appendPreReceive(std::unique_ptr iobuf) noexcept; + + protected: + ~AsyncIoUringSocket() override; + + private: + friend class ReadSqe; + friend class WriteSqe; + void setFd(NetworkSocket ns); + void registerFd(); + void unregisterFd(); + void readProcessSubmit( + struct io_uring_sqe* sqe, + IoUringBufferProviderBase* bufferProvider, + size_t* maxSize, + IoUringBufferProviderBase* usedBufferProvider) noexcept; + void readCallback( + int res, + uint32_t flags, + size_t maxSize, + IoUringBufferProviderBase* bufferProvider) noexcept; + void allowReads(); + void previousReadDone(); + void processWriteQueue() noexcept; + void setStateEstablished(); + void writeDone() noexcept; + void doSubmitWrite() noexcept; + void doReSubmitWrite() noexcept; + void failAllWrites() noexcept; + void submitRead(bool now = false); + void processConnectSubmit( + struct io_uring_sqe* sqe, sockaddr_storage& storage); + void processConnectResult(const io_uring_cqe* cqe); + void processConnectTimeout(); + void processFastOpenResult(const io_uring_cqe* cqe) noexcept; + void startSendTimeout(); + void sendTimeoutExpired(); + void failWrite(const AsyncSocketException& ex); + void readEOF(); + void readError(); + NetworkSocket takeFd(); + bool setZeroCopy(bool enable) override; + bool getZeroCopy() const override; + void setZeroCopyEnableFunc(AsyncWriter::ZeroCopyEnableFunc func) override; + + enum class State { + None, + Connecting, + Established, + Closed, + Error, + FastOpen, + }; + + static std::string toString(State s); + std::string stateAsString() const { return toString(state_); } + + struct ReadSqe : IoSqeBase, DelayedDestruction { + using UniquePtr = std::unique_ptr; + explicit ReadSqe(AsyncIoUringSocket* parent); + void processSubmit(struct io_uring_sqe* sqe) noexcept override; + void callback(const io_uring_cqe* cqe) noexcept override; + void callbackCancelled(const 
io_uring_cqe* cqe) noexcept override; + + void setReadCallback(ReadCallback* callback, bool submitNow); + ReadCallback* readCallback() const { return readCallback_; } + + size_t bytesReceived() const { return bytesReceived_; } + + std::unique_ptr takePreReceivedData(); + void appendPreReceive(std::unique_ptr data) noexcept { + appendReadData(std::move(data), preReceivedData_); + } + + void destroy() override { + parent_ = nullptr; + DelayedDestruction::destroy(); + } + + bool waitingForOldEventBaseRead() const; + void setOldEventBaseRead(folly::SemiFuture>&& f) { + oldEventBaseRead_ = std::move(f); + } + void attachEventBase(); + folly::Optional>> + detachEventBase(); + + private: + ~ReadSqe() override = default; + void appendReadData( + std::unique_ptr data, std::unique_ptr& overflow) noexcept; + void sendReadBuf( + std::unique_ptr buf, std::unique_ptr& overflow) noexcept; + bool readCallbackUseIoBufs() const; + void invalidState(ReadCallback* callback); + void processOldEventBaseRead(); + + IoUringBufferProviderBase* lastUsedBufferProvider_; + ReadCallback* readCallback_ = nullptr; + AsyncIoUringSocket* parent_; + size_t maxSize_; + uint64_t setReadCbCount_{0}; + size_t bytesReceived_{0}; + + std::unique_ptr queuedReceivedData_; + std::unique_ptr preReceivedData_; + std::unique_ptr tmpBuffer_; + bool supportsMultishotRecv_ = + false; // todo: this can be per process instead of per socket + + folly::Optional>> + oldEventBaseRead_; + std::shared_ptr alive_; + }; + + struct CloseSqe : IoSqeBase { + explicit CloseSqe(AsyncIoUringSocket* parent) + : IoSqeBase(IoSqeBase::Type::Close), parent_(parent) {} + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + parent_->closeProcessSubmit(sqe); + } + void callback(const io_uring_cqe*) noexcept override { delete this; } + void callbackCancelled(const io_uring_cqe*) noexcept override { + delete this; + } + AsyncIoUringSocket* parent_; + }; + + struct write_sqe_tag; + using write_sqe_hook = + 
boost::intrusive::list_base_hook>; + struct WriteSqe final : IoSqeBase, public write_sqe_hook { + explicit WriteSqe( + AsyncIoUringSocket* parent, + WriteCallback* callback, + std::unique_ptr&& buf, + WriteFlags flags, + bool zc); + ~WriteSqe() override { VLOG(5) << "~WriteSqe() " << this; } + + void processSubmit(struct io_uring_sqe* sqe) noexcept override; + void callback(const io_uring_cqe* cqe) noexcept override; + void callbackCancelled(const io_uring_cqe* cqe) noexcept override; + int sendMsgFlags() const; + std::pair< + folly::SemiFuture>>, + WriteSqe*> + detachEventBase(); + + boost::intrusive::list_member_hook<> member_hook_; + AsyncIoUringSocket* parent_; + WriteCallback* callback_; + std::unique_ptr buf_; + WriteFlags flags_; + static constexpr size_t kSmallIoVecSize = 16; + small_vector iov_; + size_t totalLength_; + struct msghdr msg_; + + bool zerocopy_{false}; + int refs_ = 1; + folly::Function detachedSignal_; + }; + using WriteSqeList = boost::intrusive::list< + WriteSqe, + boost::intrusive::base_hook, + boost::intrusive::constant_time_size>; + + class WriteTimeout : public AsyncTimeout { + public: + explicit WriteTimeout(AsyncIoUringSocket* socket) + : AsyncTimeout(socket->evb_), socket_(socket) {} + + void timeoutExpired() noexcept override { socket_->sendTimeoutExpired(); } + + private: + AsyncIoUringSocket* socket_; + }; + + struct ConnectSqe : IoSqeBase, AsyncTimeout { + explicit ConnectSqe(AsyncIoUringSocket* parent) + : IoSqeBase(IoSqeBase::Type::Connect), + AsyncTimeout(parent->evb_), + parent_(parent) {} + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + parent_->processConnectSubmit(sqe, addrStorage); + } + void callback(const io_uring_cqe* cqe) noexcept override { + parent_->processConnectResult(cqe); + } + void callbackCancelled(const io_uring_cqe*) noexcept override { + delete this; + } + void timeoutExpired() noexcept override { + if (!cancelled()) { + parent_->processConnectTimeout(); + } + } + AsyncIoUringSocket* 
parent_; + sockaddr_storage addrStorage; + }; + + struct FastOpenSqe : IoSqeBase { + explicit FastOpenSqe( + AsyncIoUringSocket* parent, + SocketAddress const& addr, + std::unique_ptr initialWrite); + void processSubmit(struct io_uring_sqe* sqe) noexcept override; + void cleanupMsg() noexcept; + void callback(const io_uring_cqe* cqe) noexcept override { + cleanupMsg(); + parent_->processFastOpenResult(cqe); + } + void callbackCancelled(const io_uring_cqe*) noexcept override { + delete this; + } + + AsyncIoUringSocket* parent_; + std::unique_ptr initialWrite; + size_t addrLen_; + sockaddr_storage addrStorage; + }; + + EventBase* evb_ = nullptr; + NetworkSocket fd_; + IoUringBackend* backend_ = nullptr; + Options options_; + mutable SocketAddress localAddress_; + mutable SocketAddress peerAddress_; + IoUringFdRegistrationRecord* fdRegistered_ = nullptr; + int usedFd_ = -1; + unsigned int mbFixedFileFlags_ = 0; + std::unique_ptr closeSqe_{new CloseSqe(this)}; + + State state_ = State::None; + + // read + friend struct DetachFdState; + ReadSqe::UniquePtr readSqe_; + + // write + std::chrono::milliseconds writeTimeoutTime_{0}; + WriteTimeout writeTimeout_{this}; + WriteSqe* writeSqeActive_ = nullptr; + WriteSqeList writeSqeQueue_; + size_t bytesWritten_{0}; + + // connect + std::unique_ptr connectSqe_; + AsyncSocket::ConnectCallback* connectCallback_; + std::chrono::milliseconds connectTimeout_{0}; + std::chrono::steady_clock::time_point connectStartTime_; + std::chrono::steady_clock::time_point connectEndTime_; + + // stopTLS helpers: + std::string securityProtocol_; + std::string applicationProtocol_; + + std::shared_ptr selfCert_; + std::shared_ptr peerCert_; + + // shutdown: + int shutdownFlags_ = 0; + + // TCP fast open + std::unique_ptr fastOpenSqe_; + bool enableTFO_ = false; + + // detach event base + bool isDetaching_ = false; + Optional>>> + detachedWriteResult_; + std::shared_ptr alive_; + + void closeProcessSubmit(struct io_uring_sqe* sqe); +}; +} // 
namespace folly + +#endif diff --git a/folly/io/async/AsyncIoUringSocketFactory.h b/folly/io/async/AsyncIoUringSocketFactory.h new file mode 100644 index 00000000000..9df4e13310c --- /dev/null +++ b/folly/io/async/AsyncIoUringSocketFactory.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace folly { + +class AsyncIoUringSocketFactory { + public: + static bool supports([[maybe_unused]] folly::EventBase* eb) { +#if FOLLY_HAS_LIBURING + return AsyncIoUringSocket::supports(eb); +#else + return false; +#endif + } + + template + static TWrapper create([[maybe_unused]] Args&&... 
args) { +#if FOLLY_HAS_LIBURING + return TWrapper(new AsyncIoUringSocket(std::forward(args)...)); +#else + throw std::runtime_error("AsyncIoUringSocket not supported"); +#endif + } + + static bool asyncDetachFd( + [[maybe_unused]] AsyncTransport& transport, + [[maybe_unused]] AsyncDetachFdCallback* callback) { +#if FOLLY_HAS_LIBURING + AsyncIoUringSocket* socket = + transport.getUnderlyingTransport(); + if (socket) { + socket->asyncDetachFd(callback); + return true; + } +#endif + + return false; + } +}; + +} // namespace folly diff --git a/folly/io/async/BUCK b/folly/io/async/BUCK index cf3b3fe8bab..3bb2936b902 100644 --- a/folly/io/async/BUCK +++ b/folly/io/async/BUCK @@ -535,3 +535,341 @@ cpp_library( "//folly/io/async:delayed_destruction", ], ) + +cpp_library( + name = "async_base_class", + srcs = ["AsyncBase.cpp"], + headers = ["AsyncBase.h"], + deps = [ + "//folly:exception", + "//folly:format", + "//folly:likely", + "//folly:string", + "//folly/portability:filesystem", + "//folly/portability:unistd", + ], + exported_deps = [ + "//folly:function", + "//folly:portability", + "//folly:range", + "//folly/portability:sys_uio", + ], + external_deps = [ + "boost", + "glog", + ], +) + +cpp_library( + name = "async_io", + srcs = ["AsyncIO.cpp"], + headers = ["AsyncIO.h"], + deps = [ + "fbsource//third-party/fmt:fmt", + "//folly:exception", + "//folly:likely", + "//folly:small_vector", + "//folly:string", + "//folly/portability:unistd", + ], + exported_deps = [ + "//folly/experimental/io:async_base", + ], + external_deps = [ + "boost", + "glog", + ], + exported_external_deps = [ + ("libaio", None, "aio"), + ], +) + +cpp_library( + # @autodeps-skip + name = "liburing", + headers = ["Liburing.h"], + os_deps = [( + "linux", + select({ + "DEFAULT": ["fbsource//third-party/liburing:uring"], + "ovr_config//os:linux-sgx": [], + }), + )], +) + +cpp_library( + name = "async_io_uring_socket", + srcs = [ + "AsyncIoUringSocket.cpp", + ], + headers = [ + "AsyncIoUringSocket.h", + 
"AsyncIoUringSocketFactory.h", + ], + deps = [ + "//folly:conv", + "//folly/detail:socket_fast_open", + "//folly/experimental/io:io_uring_event_base_local", + "//folly/memory:malloc", + "//folly/portability:sys_uio", + ], + exported_deps = [ + "//folly:network_address", + "//folly:optional", + "//folly:small_vector", + "//folly/experimental/io:io_uring_backend", + "//folly/experimental/io:liburing", + "//folly/futures:core", + "//folly/io:iobuf", + "//folly/io:socket_option_map", + "//folly/io/async:async_base", + "//folly/io/async:async_socket", + "//folly/io/async:async_socket_exception", + "//folly/io/async:async_transport", + "//folly/io/async:delayed_destruction", + "//folly/net:net_ops_dispatcher", + "//folly/portability:sockets", + ], + exported_external_deps = [ + "boost", + ], +) + +cpp_library( + name = "simple_async_io", + srcs = ["SimpleAsyncIO.cpp"], + headers = ["SimpleAsyncIO.h"], + deps = [ + "//folly:string", + "//folly/experimental/coro:baton", + "//folly/experimental/io:async_io", + "//folly/experimental/io:io_uring", + "//folly/experimental/io:liburing", + "//folly/portability:sockets", + ], + exported_deps = [ + "//folly:synchronized", + "//folly/executors:global_executor", + "//folly/experimental/coro:task", + "//folly/experimental/io:async_base", + "//folly/io/async:async_base", + "//folly/io/async:scoped_event_base_thread", + ], + exported_external_deps = [ + ], +) + +cpp_library( + name = "epoll", + headers = [ + "Epoll.h", + ], +) + +cpp_library( + name = "epoll_backend", + srcs = [ + "EpollBackend.cpp", + ], + headers = [ + "Epoll.h", + "EpollBackend.h", + ], + modular_headers = False, + deps = [ + "//folly:file_util", + "//folly:intrusive_list", + "//folly:map_util", + "//folly:string", + ], + exported_deps = [ + "//folly/container:intrusive_heap", + "//folly/io/async:async_base", + ], +) + +cpp_library( + name = "event_base_poller", + srcs = ["EventBasePoller.cpp"], + headers = ["EventBasePoller.h"], + deps = [ + 
"fbsource//third-party/fmt:fmt", + "//folly:file_util", + "//folly:string", + "//folly/experimental/io:epoll", + "//folly/experimental/io:liburing", + "//folly/lang:align", + "//folly/portability:gflags", + "//folly/synchronization:baton", + "//folly/system:thread_name", + ], + exported_deps = [ + "//folly:function", + "//folly:range", + "//folly:synchronized", + ], + external_deps = [ + "boost", + "glog", + ], +) + +cpp_library( + name = "mux_io_thread_pool_executor", + srcs = ["MuxIOThreadPoolExecutor.cpp"], + headers = ["MuxIOThreadPoolExecutor.h"], + deps = [ + "fbsource//third-party/fmt:fmt", + "//folly/container:enumerate", + "//folly/experimental/io:epoll_backend", + "//folly/lang:align", + "//folly/synchronization:latch", + ], + exported_deps = [ + "//folly:portability", + "//folly/concurrency:unbounded_queue", + "//folly/executors:io_thread_pool_executor", + "//folly/executors:queue_observer", + "//folly/experimental/io:event_base_poller", + "//folly/io/async:event_base_manager", + "//folly/synchronization:baton", + "//folly/synchronization:relaxed_atomic", + "//folly/synchronization:throttled_lifo_sem", + "//folly/synchronization:wait_options", + ], +) + +cpp_library( + name = "io_uring", + srcs = ["IoUring.cpp"], + headers = ["IoUring.h"], + modular_headers = False, + deps = [ + "fbsource//third-party/fmt:fmt", + "//folly:exception", + "//folly:likely", + "//folly:string", + "//folly/portability:unistd", + ], + exported_deps = [ + "//folly:shared_mutex", + "//folly/experimental/io:async_base", + "//folly/experimental/io:liburing", + ], + external_deps = [ + "boost", + "glog", + ], +) + +cpp_library( + name = "io_uring_backend", + srcs = [ + "IoUringBackend.cpp", + ], + headers = [ + "IoUringBackend.h", + "IoUringBase.h", + ], + modular_headers = False, + deps = [ + "//folly:demangle", + "//folly:file_util", + "//folly:glog", + "//folly:likely", + "//folly:spin_lock", + "//folly:string", + "//folly/container:f14_hash", + 
"//folly/experimental/io:io_uring_provided_buffer_ring", + "//folly/lang:bits", + "//folly/portability:gflags", + "//folly/portability:sockets", + "//folly/portability:sys_mman", + "//folly/portability:sys_syscall", + "//folly/synchronization:call_once", + ], + exported_deps = [ + "//folly:c_portability", + "//folly:conv", + "//folly:cpp_attributes", + "//folly:exception_string", + "//folly:function", + "//folly:optional", + "//folly:range", + "//folly:small_vector", + "//folly/experimental/io:liburing", + "//folly/io:iobuf", + "//folly/io/async:async_base", + "//folly/io/async:delayed_destruction", + "//folly/portability:asm", + ], + exported_external_deps = [ + "boost", + "glog", + ], +) + +cpp_library( + name = "io_uring_provided_buffer_ring", + srcs = [ + "IoUringProvidedBufferRing.cpp", + ], + headers = [ + "IoUringBase.h", + "IoUringProvidedBufferRing.h", + ], + modular_headers = False, + deps = [ + "//folly:conv", + "//folly:exception_string", + "//folly:string", + ], + exported_deps = [ + "//folly/experimental/io:liburing", + "//folly/io:iobuf", + "//folly/io/async:delayed_destruction", + "//folly/portability:sys_mman", + ], + exported_external_deps = [ + "boost", + ], +) + +cpp_library( + name = "io_uring_event", + srcs = [ + "IoUringEvent.cpp", + ], + headers = [ + "IoUringEvent.h", + ], + modular_headers = False, + exported_deps = [ + "//folly:file", + "//folly/experimental/io:io_uring_backend", + "//folly/experimental/io:liburing", + "//folly/io/async:async_base", + ], +) + +cpp_library( + name = "io_uring_event_base_local", + srcs = [ + "IoUringEventBaseLocal.cpp", + ], + headers = [ + "IoUringEventBaseLocal.h", + ], + modular_headers = False, + deps = [ + "//folly:singleton", + "//folly/experimental/io:io_uring_event", + ], + exported_deps = [ + "//folly/experimental/io:io_uring_backend", + "//folly/experimental/io:liburing", + "//folly/io/async:async_base", + ], + exported_external_deps = [ + ], +) diff --git a/folly/io/async/Epoll.h 
b/folly/io/async/Epoll.h new file mode 100644 index 00000000000..2cb342b6a0d --- /dev/null +++ b/folly/io/async/Epoll.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#if defined(__linux__) && __has_include() +#define FOLLY_HAS_EPOLL 1 +#else +#define FOLLY_HAS_EPOLL 0 +#endif diff --git a/folly/experimental/io/EpollBackend.cpp b/folly/io/async/EpollBackend.cpp similarity index 99% rename from folly/experimental/io/EpollBackend.cpp rename to folly/io/async/EpollBackend.cpp index 1273cd86819..0462aa49e39 100644 --- a/folly/experimental/io/EpollBackend.cpp +++ b/folly/io/async/EpollBackend.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include // @manual +#include // @manual #if FOLLY_HAS_EPOLL @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include diff --git a/folly/io/async/EpollBackend.h b/folly/io/async/EpollBackend.h new file mode 100644 index 00000000000..64c5949e94d --- /dev/null +++ b/folly/io/async/EpollBackend.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#if FOLLY_HAS_EPOLL + +#include +#include +#include +#include +#include + +#include +#include + +namespace folly { + +class EpollBackend : public EventBaseBackendBase { + public: + struct Options { + size_t numLoopEvents{128}; + + Options& setNumLoopEvents(size_t val) { + numLoopEvents = val; + return *this; + } + }; + + explicit EpollBackend(Options options); + ~EpollBackend() override; + + int getEpollFd() const { return epollFd_; } + + int getPollableFd() const override { return epollFd_; } + + event_base* getEventBase() override { return nullptr; } + + // Returns a non-standard value 2 when called with EVLOOP_NONBLOCK and the + // loop would block if called in a blocking fashion. 
+ int eb_event_base_loop(int flags) override; + int eb_event_base_loopbreak() override; + + int eb_event_add(Event& event, const struct timeval* timeout) override; + int eb_event_del(Event& event) override; + + bool eb_event_active(Event&, int) override { return false; } + + bool setEdgeTriggered(Event& event) override; + + private: + struct TimerInfo; + + class SocketPair { + public: + SocketPair(); + + SocketPair(const SocketPair&) = delete; + SocketPair& operator=(const SocketPair&) = delete; + + ~SocketPair(); + + int readFd() const { return fds_[1]; } + + int writeFd() const { return fds_[0]; } + + private: + std::array fds_{{-1, -1}}; + }; + + void updateTimerFd(); + void addTimerEvent(Event& event, const struct timeval* timeout); + int removeTimerEvent(Event& event); + void processTimers(); + void setProcessTimers(); + + void addSignalEvent(Event& event); + int removeSignalEvent(Event& event); + void processSignals(); + + const Options options_; + + int epollFd_{-1}; + + size_t numInsertedEvents_{0}; + size_t numInternalEvents_{0}; + + bool loopBreak_{false}; + std::vector events_; // Cache allocation. + + int timerFd_{-1}; + std::optional timerFdExpiration_; + IntrusiveHeap timers_; + + SocketPair signalFds_; + std::map> signals_; +}; +} // namespace folly +#endif diff --git a/folly/experimental/io/EventBasePoller.cpp b/folly/io/async/EventBasePoller.cpp similarity index 99% rename from folly/experimental/io/EventBasePoller.cpp rename to folly/io/async/EventBasePoller.cpp index a40074564e4..4bfcc4209d9 100644 --- a/folly/experimental/io/EventBasePoller.cpp +++ b/folly/io/async/EventBasePoller.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/folly/io/async/EventBasePoller.h b/folly/io/async/EventBasePoller.h new file mode 100644 index 00000000000..2359fc399b2 --- /dev/null +++ b/folly/io/async/EventBasePoller.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include + +namespace folly::detail { + +/** + * EventBasePoller centralizes the blocking wait for events across multiple + * EventBases in a process. The singleton calls the provided ReadyCallback on + * ready EventBases, so they can be driven without blocking. This enables + * control over which threads drive the EventBases, as opposed to the standard + * blocking loop that requires one thread per EventBase. + * + * EventBases' pollable fds are registered in groups, so that the callback can + * batch processing of ready EventBases that belong to the same group. + * + * When the EventBase is ready it can be driven until it would block again, and + * then handoff() must be called to resume polling the fd. Neither the driving + * of the EventBase nor the call to handoff() should happen inline in the + * callback, but delegated to another thread without blocking; the callback must + * return control quickly, as it executes in the main polling loop and can slow + * down the handling of all other registered EventBases. + * + * Note that none of the implementation is specific to EventBases; in fact this + * is a lightweight implementation of an event loop specialized on polling read + * events, and which supports grouping of the fds for batch-handling. The class + * could be easily generalized if other applications arise.
+ */ +class EventBasePoller { + public: + struct Stats { + using Duration = std::chrono::steady_clock::duration; + + // Track number of loop wake-ups and number of events returned. + int minNumEvents{std::numeric_limits::max()}; + int maxNumEvents{std::numeric_limits::min()}; + size_t totalNumEvents{0}; + size_t totalWakeups{0}; + + Duration totalWait{0}; + Duration minWait{Duration::max()}; + Duration maxWait{Duration::min()}; + + Duration totalBusy{0}; + Duration minBusy{Duration::max()}; + Duration maxBusy{Duration::min()}; + + void update(int numEvents, Duration wait, Duration busy); + }; + + class Handle { + public: + virtual ~Handle(); + + template + T* getUserData() const { + return reinterpret_cast(userData_); + } + + // If done is set to true, the handle is not re-armed and can be reclaimed + // with reclaim(). + virtual void handoff(bool done) = 0; + + protected: + friend class EventBasePoller; + + explicit Handle(void* userData) : userData_(userData) {} + + void* userData_; + }; + + // FdGroup method invocations must be serialized. + class FdGroup { + public: + virtual ~FdGroup(); + + // All added handles must be reclaimed before the group is destroyed. + virtual std::unique_ptr add(int fd, void* userData) = 0; + // Blocks until handoff(true) is called on the handle. 
+ virtual void reclaim(std::unique_ptr handle) = 0; + }; + + using ReadyCallback = + Function readyHandles) const noexcept>; + + static EventBasePoller& get(); + + virtual ~EventBasePoller(); + + virtual std::unique_ptr makeFdGroup(ReadyCallback readyCallback) = 0; + + Stats getStats() { return stats_.copy(); } + + protected: + folly::Synchronized stats_; +}; + +} // namespace folly::detail diff --git a/folly/experimental/io/IoUring.cpp b/folly/io/async/IoUring.cpp similarity index 99% rename from folly/experimental/io/IoUring.cpp rename to folly/io/async/IoUring.cpp index 5ff3e97a515..2092da6e698 100644 --- a/folly/experimental/io/IoUring.cpp +++ b/folly/io/async/IoUring.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/folly/io/async/IoUring.h b/folly/io/async/IoUring.h new file mode 100644 index 00000000000..9e1d1592587 --- /dev/null +++ b/folly/io/async/IoUring.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#if FOLLY_HAS_LIBURING + +#include // @manual + +namespace folly { + +/** + * An IoUringOp represents a pending operation. You may set a notification + * callback or you may use this class's methods directly. + * + * The op must remain allocated until it is completed or canceled. 
+ */ +class IoUringOp : public AsyncBaseOp { + friend class IoUring; + friend std::ostream& operator<<(std::ostream& stream, const IoUringOp& o); + + public: + struct Options { + Options() : sqe128(false), cqe32(false) {} + bool sqe128; + bool cqe32; + + bool operator==(const Options& options) const { + return sqe128 == options.sqe128 && cqe32 == options.cqe32; + } + + bool operator!=(const Options& options) const { + return !operator==(options); + } + }; + + IoUringOp( + NotificationCallback cb = NotificationCallback(), + Options options = Options()); + IoUringOp(const IoUringOp&) = delete; + IoUringOp& operator=(const IoUringOp&) = delete; + ~IoUringOp() override; + + /** + * Initiate a read request. + */ + void pread(int fd, void* buf, size_t size, off_t start) override; + void preadv(int fd, const iovec* iov, int iovcnt, off_t start) override; + void pread( + int fd, void* buf, size_t size, off_t start, int buf_index) override; + + /** + * Initiate a write request. + */ + void pwrite(int fd, const void* buf, size_t size, off_t start) override; + void pwritev(int fd, const iovec* iov, int iovcnt, off_t start) override; + void pwrite(int fd, const void* buf, size_t size, off_t start, int buf_index) + override; + + void reset(NotificationCallback cb = NotificationCallback()) override; + + AsyncIOOp* getAsyncIOOp() override { return nullptr; } + + IoUringOp* getIoUringOp() override { return this; } + + void toStream(std::ostream& os) const override; + + void initBase() { init(); } + + struct io_uring_sqe& getSqe() { return sqe_.sqe; } + + size_t getSqeSize() const { + return options_.sqe128 ? 128 : sizeof(struct io_uring_sqe); + } + + const struct io_uring_cqe& getCqe() const { + return *reinterpret_cast(&cqe_); + } + + size_t getCqeSize() const { + return options_.cqe32 ? 
32 : sizeof(struct io_uring_cqe); + } + + void setCqe(const struct io_uring_cqe* cqe) { + ::memcpy(&cqe_, cqe, getCqeSize()); + } + + const Options& getOptions() const { return options_; } + + private: + Options options_; + + // we use unions with the largest size to avoid + // individual allocations for the sqe/cqe + union { + struct io_uring_sqe sqe; + uint8_t data[128]; + } sqe_; + + // we have to use a union here because of -Wgnu-variable-sized-type-not-at-end + //__u64 big_cqe[]; + union { + __u64 user_data; // first member from io_uring_cqe + uint8_t data[32]; + } cqe_; + + struct iovec iov_[1]; +}; + +std::ostream& operator<<(std::ostream& stream, const IoUringOp& op); + +/** + * C++ interface around Linux io_uring + */ +class IoUring : public AsyncBase { + public: + using Op = IoUringOp; + + /** + * Note: the maximum number of allowed concurrent requests is controlled + * by the kernel IORING_MAX_ENTRIES and the memlock limit. + * The default IORING_MAX_ENTRIES value is usually 32K.
+ */ + explicit IoUring( + size_t capacity, + PollMode pollMode = NOT_POLLABLE, + size_t maxSubmit = 1, + IoUringOp::Options options = IoUringOp::Options()); + IoUring(const IoUring&) = delete; + IoUring& operator=(const IoUring&) = delete; + ~IoUring() override; + + static bool isAvailable(); + + const IoUringOp::Options& getOptions() const { return options_; } + + int register_buffers(const struct iovec* iovecs, unsigned int nr_iovecs); + + int unregister_buffers(); + + void initializeContext() override; + + protected: + int drainPollFd() override; + int submitOne(AsyncBase::Op* op) override; + int submitRange(Range ops) override; + + private: + Range doWait( + WaitType type, + size_t minRequests, + size_t maxRequests, + std::vector& result) override; + + size_t maxSubmit_; + IoUringOp::Options options_; + struct io_uring_params params_; + struct io_uring ioRing_; + mutable SharedMutex submitMutex_; +}; + +using IoUringQueue = AsyncBaseQueue; +} // namespace folly + +#endif diff --git a/folly/experimental/io/IoUringBackend.cpp b/folly/io/async/IoUringBackend.cpp similarity index 99% rename from folly/experimental/io/IoUringBackend.cpp rename to folly/io/async/IoUringBackend.cpp index 1b90ba4310e..2033acb0ca0 100644 --- a/folly/experimental/io/IoUringBackend.cpp +++ b/folly/io/async/IoUringBackend.cpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/folly/io/async/IoUringBackend.h b/folly/io/async/IoUringBackend.h new file mode 100644 index 00000000000..c09dd7bb5b5 --- /dev/null +++ b/folly/io/async/IoUringBackend.h @@ -0,0 +1,1105 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __has_include() +#include +#endif + +#if FOLLY_HAS_LIBURING + +#include // @manual + +namespace folly { + +class IoUringBackend : public EventBaseBackendBase { + public: + class FOLLY_EXPORT NotAvailable : public std::runtime_error { + public: + using std::runtime_error::runtime_error; + }; + + struct Options { + enum Flags { + POLL_SQ = 0x1, + POLL_CQ = 0x2, + POLL_SQ_IMMEDIATE_IO = 0x4, // do not enqueue I/O operations + }; + + Options() = default; + + Options& setCapacity(size_t v) { + capacity = v; + return *this; + } + + Options& setMinCapacity(size_t v) { + minCapacity = v; + + return *this; + } + + Options& setMaxSubmit(size_t v) { + maxSubmit = v; + + return *this; + } + + Options& setSqeSize(size_t v) { + sqeSize = v; + + return *this; + } + + Options& setMaxGet(size_t v) { + maxGet = v; + + return *this; + } + + Options& setUseRegisteredFds(size_t v) { + registeredFds = v; + return *this; + } + + Options& setFlags(uint32_t v) { + flags = v; + + return *this; + } + + Options& setSQIdle(std::chrono::milliseconds v) { + sqIdle = v; + + return *this; + } + + Options& setCQIdle(std::chrono::milliseconds v) { + cqIdle = v; + + return *this; + } + + // Set the CPU as preferred for submission queue poll thread. + // + // This only has effect if POLL_SQ flag is specified. 
+ // + // Can call multiple times to specify multiple CPUs. + Options& setSQCpu(uint32_t v) { + sqCpus.insert(v); + + return *this; + } + + // Set the preferred CPUs for submission queue poll thread(s). + // + // This only has effect if POLL_SQ flag is specified. + Options& setSQCpus(std::set const& cpus) { + sqCpus.insert(cpus.begin(), cpus.end()); + + return *this; + } + + Options& setSQGroupName(const std::string& v) { + sqGroupName = v; + + return *this; + } + + Options& setSQGroupNumThreads(size_t v) { + sqGroupNumThreads = v; + + return *this; + } + + Options& setInitialProvidedBuffers(size_t eachSize, size_t count) { + initialProvidedBuffersCount = count; + initialProvidedBuffersEachSize = eachSize; + return *this; + } + + Options& setRegisterRingFd(bool v) { + registerRingFd = v; + + return *this; + } + + Options& setTaskRunCoop(bool v) { + taskRunCoop = v; + + return *this; + } + + Options& setDeferTaskRun(bool v) { + deferTaskRun = v; + + return *this; + } + + Options& setTimeout(std::chrono::microseconds v) { + timeout = v; + + return *this; + } + + Options& setBatchSize(int v) { + batchSize = v; + + return *this; + } + + ssize_t sqeSize{-1}; + + size_t capacity{256}; + size_t minCapacity{0}; + size_t maxSubmit{128}; + size_t maxGet{256}; + size_t registeredFds{0}; + size_t sqGroupNumThreads{1}; + size_t initialProvidedBuffersCount{0}; + size_t initialProvidedBuffersEachSize{0}; + + uint32_t flags{0}; + + // Minimum number of requests (defined as sockets with data to read) to wait + // for per io_uring_enter + int batchSize{0}; + + bool registerRingFd{false}; + bool taskRunCoop{false}; + bool deferTaskRun{false}; + + // Maximum amount of time to wait (in microseconds) per io_uring_enter + // Both timeout _and_ batchSize must be set for io_uring_enter wait_nr to be + // set! 
+ std::chrono::microseconds timeout{0}; + std::chrono::milliseconds sqIdle{0}; + std::chrono::milliseconds cqIdle{0}; + + std::set sqCpus; + + std::string sqGroupName; + }; + + explicit IoUringBackend(Options options); + ~IoUringBackend() override; + Options const& options() const { return options_; } + + bool isWaitingToSubmit() const { + return waitingToSubmit_ || !submitList_.empty(); + } + struct io_uring* ioRingPtr() { return &ioRing_; } + struct io_uring_params const& params() const { return params_; } + bool useReqBatching() const { + return options_.timeout.count() > 0 && options_.batchSize > 0; + } + + // from EventBaseBackendBase + int getPollableFd() const override { return ioRing_.ring_fd; } + + event_base* getEventBase() override { return nullptr; } + + int eb_event_base_loop(int flags) override; + int eb_event_base_loopbreak() override; + + int eb_event_add(Event& event, const struct timeval* timeout) override; + int eb_event_del(Event& event) override; + + bool eb_event_active(Event&, int) override { return false; } + + size_t loopPoll(); + void submitOutstanding(); + unsigned int processCompleted(); + + // returns true if the current Linux kernel version + // supports the io_uring backend + static bool isAvailable(); + bool kernelHasNonBlockWriteFixes() const; + static bool kernelSupportsRecvmsgMultishot(); + static bool kernelSupportsDeferTaskrun(); + static bool kernelSupportsSendZC(); + + IoUringFdRegistrationRecord* registerFd(int fd) noexcept { + return fdRegistry_.alloc(fd); + } + + bool unregisterFd(IoUringFdRegistrationRecord* rec) { + return fdRegistry_.free(rec); + } + + // CQ poll mode loop callback + using CQPollLoopCallback = folly::Function; + + void setCQPollLoopCallback(CQPollLoopCallback&& cb) { + cqPollLoopCallback_ = std::move(cb); + } + + // read/write/fsync/fdatasync file operation callback + // int param is the io_uring_cqe res field + // i.e. 
the result of the file operation + using FileOpCallback = folly::Function; + + void queueRead( + int fd, + void* buf, + unsigned int nbytes, + off_t offset, + FileOpCallback&& cb); + + void queueWrite( + int fd, + const void* buf, + unsigned int nbytes, + off_t offset, + FileOpCallback&& cb); + + void queueReadv( + int fd, + Range iovecs, + off_t offset, + FileOpCallback&& cb); + + void queueWritev( + int fd, + Range iovecs, + off_t offset, + FileOpCallback&& cb); + + // there is no ordering between the prev submitted write + // requests and the sync ops + // ordering can be achieved by calling queue*sync from one of + // the prev write callbacks, once all the write operations + // we have to wait for are done + void queueFsync(int fd, FileOpCallback&& cb); + void queueFdatasync(int fd, FileOpCallback&& cb); + + void queueOpenat( + int dfd, const char* path, int flags, mode_t mode, FileOpCallback&& cb); + + void queueOpenat2( + int dfd, const char* path, struct open_how* how, FileOpCallback&& cb); + + void queueClose(int fd, FileOpCallback&& cb); + + void queueStatx( + int dirfd, + const char* pathname, + int flags, + unsigned int mask, + struct statx* statxbuf, + FileOpCallback&& cb); + + void queueFallocate( + int fd, int mode, off_t offset, off_t len, FileOpCallback&& cb); + + // sendmsg/recvmsg + void queueSendmsg( + int fd, + const struct msghdr* msg, + unsigned int flags, + FileOpCallback&& cb); + + void queueRecvmsg( + int fd, struct msghdr* msg, unsigned int flags, FileOpCallback&& cb); + + void submit(IoSqeBase& ioSqe) { + // todo verify that the sqe is valid!
+ submitImmediateIoSqe(ioSqe); + } + + void submitNextLoop(IoSqeBase& ioSqe) noexcept; + void submitSoon(IoSqeBase& ioSqe) noexcept; + void submitNow(IoSqeBase& ioSqe); + void cancel(IoSqeBase* sqe); + + // built in buffer provider + IoUringBufferProviderBase* bufferProvider() { return bufferProvider_.get(); } + uint16_t nextBufferProviderGid() { return bufferProviderGidNext_++; } + + protected: + enum class WaitForEventsMode { WAIT, DONT_WAIT }; + + class SocketPair { + public: + SocketPair(); + + SocketPair(const SocketPair&) = delete; + SocketPair& operator=(const SocketPair&) = delete; + + ~SocketPair(); + + int readFd() const { return fds_[1]; } + + int writeFd() const { return fds_[0]; } + + private: + std::array fds_{-1, -1}; + }; + + struct UserData { + uintptr_t value; + explicit UserData(void* p) noexcept + : value{reinterpret_cast(p)} {} + /* implicit */ operator uint64_t() const noexcept { return value; } + /* implicit */ operator void*() const noexcept { + return reinterpret_cast(value); + } + }; + + static uint32_t getPollFlags(short events) { + uint32_t ret = 0; + if (events & EV_READ) { + ret |= POLLIN; + } + + if (events & EV_WRITE) { + ret |= POLLOUT; + } + + return ret; + } + + static short getPollEvents(uint32_t flags, short events) { + short ret = 0; + if (flags & POLLIN) { + ret |= EV_READ; + } + + if (flags & POLLOUT) { + ret |= EV_WRITE; + } + + if (flags & (POLLERR | POLLHUP)) { + ret |= (EV_READ | EV_WRITE); + } + + ret &= events; + + return ret; + } + + // timer processing + bool addTimerFd(); + void scheduleTimeout(); + void scheduleTimeout(const std::chrono::microseconds& us); + void addTimerEvent(Event& event, const struct timeval* timeout); + void removeTimerEvent(Event& event); + size_t processTimers(); + void setProcessTimers(); + + size_t processActiveEvents(); + + struct IoSqe; + + static void processPollIoSqe( + IoUringBackend* backend, IoSqe* ioSqe, int res, uint32_t flags); + static void processTimerIoSqe( + IoUringBackend* 
backend, + IoSqe* /*sqe*/, + int /*res*/, + uint32_t /* flags */); + static void processSignalReadIoSqe( + IoUringBackend* backend, + IoSqe* /*sqe*/, + int /*res*/, + uint32_t /* flags */); + + // signal handling + void addSignalEvent(Event& event); + void removeSignalEvent(Event& event); + bool addSignalFds(); + size_t processSignals(); + void setProcessSignals(); + + void processPollIo(IoSqe* ioSqe, int res, uint32_t flags) noexcept; + + IoSqe* FOLLY_NULLABLE allocIoSqe(const EventCallback& cb); + void releaseIoSqe(IoSqe* aioIoSqe) noexcept; + + // submit immediate if POLL_SQ | POLL_SQ_IMMEDIATE_IO flags are set + void submitImmediateIoSqe(IoSqeBase& ioSqe); + + void internalSubmit(IoSqeBase& ioSqe) noexcept; + + enum class InternalProcessCqeMode { + NORMAL, // process existing and any available + AVAILABLE_ONLY, // process existing but don't get more + CANCEL_ALL, // cancel every sqe + }; + unsigned int internalProcessCqe( + unsigned int maxGet, InternalProcessCqeMode mode) noexcept; + + int eb_event_modify_inserted(Event& event, IoSqe* ioSqe); + + struct FdRegistry { + FdRegistry() = delete; + FdRegistry(struct io_uring& ioRing, size_t n); + + IoUringFdRegistrationRecord* alloc(int fd) noexcept; + bool free(IoUringFdRegistrationRecord* record); + + int init(); + size_t update(); + + bool err_{false}; + struct io_uring& ioRing_; + std::vector files_; + size_t inUse_; + std::vector records_; + boost::intrusive:: + slist> + free_; + }; + + struct IoSqe : public IoSqeBase { + using BackendCb = void(IoUringBackend*, IoSqe*, int, uint32_t); + explicit IoSqe( + IoUringBackend* backend = nullptr, + bool poolAlloc = false, + bool persist = false) + : backend_(backend), poolAlloc_(poolAlloc), persist_(persist) {} + + void callback(const io_uring_cqe* cqe) noexcept override { + backendCb_(backend_, this, cqe->res, cqe->flags); + } + void callbackCancelled(const io_uring_cqe*) noexcept override { release(); } + virtual void release() noexcept; + + IoUringBackend* backend_; 
+ BackendCb* backendCb_{nullptr}; + const bool poolAlloc_; + const bool persist_; + Event* event_{nullptr}; + IoUringFdRegistrationRecord* fdRecord_{nullptr}; + size_t useCount_{0}; + int res_; + uint32_t cqeFlags_; + + FOLLY_ALWAYS_INLINE void resetEvent() { + // remove it from the list + unlink(); + if (event_) { + event_->setUserData(nullptr); + event_ = nullptr; + } + } + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + auto* ev = event_->getEvent(); + if (ev) { + const auto& cb = event_->getCallback(); + switch (cb.type_) { + case EventCallback::Type::TYPE_NONE: + break; + case EventCallback::Type::TYPE_READ: + if (auto* iov = cb.readCb_->allocateData()) { + prepRead( + sqe, + ev->ev_fd, + &iov->data_, + 0, + (ev->ev_events & EV_PERSIST) != 0); + cbData_.set(iov); + return; + } + break; + case EventCallback::Type::TYPE_RECVMSG: + if (auto* msg = cb.recvmsgCb_->allocateData()) { + prepRecvmsg( + sqe, + ev->ev_fd, + &msg->data_, + (ev->ev_events & EV_PERSIST) != 0); + cbData_.set(msg); + return; + } + break; + case EventCallback::Type::TYPE_RECVMSG_MULTISHOT: + if (auto* hdr = + cb.recvmsgMultishotCb_->allocateRecvmsgMultishotData()) { + prepRecvmsgMultishot(sqe, ev->ev_fd, &hdr->data_); + cbData_.set(hdr); + return; + } + break; + } + prepPollAdd(sqe, ev->ev_fd, getPollFlags(ev->ev_events)); + } + } + + virtual void processActive() {} + + struct EventCallbackData { + EventCallback::Type type_{EventCallback::Type::TYPE_NONE}; + union { + EventReadCallback::IoVec* ioVec_; + EventRecvmsgCallback::MsgHdr* msgHdr_; + EventRecvmsgMultishotCallback::Hdr* hdr_; + }; + + void set(EventReadCallback::IoVec* ioVec) { + type_ = EventCallback::Type::TYPE_READ; + ioVec_ = ioVec; + } + + void set(EventRecvmsgCallback::MsgHdr* msgHdr) { + type_ = EventCallback::Type::TYPE_RECVMSG; + msgHdr_ = msgHdr; + } + + void set(EventRecvmsgMultishotCallback::Hdr* hdr) { + type_ = EventCallback::Type::TYPE_RECVMSG_MULTISHOT; + hdr_ = hdr; + } + + void reset() { type_ = 
EventCallback::Type::TYPE_NONE; } + + bool processCb(IoUringBackend* backend, int res, uint32_t flags) { + bool ret = false; + bool released = false; + switch (type_) { + case EventCallback::Type::TYPE_READ: { + released = ret = true; + auto cbFunc = ioVec_->cbFunc_; + cbFunc(ioVec_, res); + break; + } + case EventCallback::Type::TYPE_RECVMSG: { + released = ret = true; + auto cbFunc = msgHdr_->cbFunc_; + cbFunc(msgHdr_, res); + break; + } + case EventCallback::Type::TYPE_RECVMSG_MULTISHOT: { + ret = true; + std::unique_ptr buf; + if (flags & IORING_CQE_F_BUFFER) { + if (IoUringBufferProviderBase* bp = backend->bufferProvider()) { + buf = bp->getIoBuf(flags >> 16, res); + } + } + hdr_->cbFunc_(hdr_, res, std::move(buf)); + if (!(flags & IORING_CQE_F_MORE)) { + hdr_->freeFunc_(hdr_); + released = true; + } + break; + } + case EventCallback::Type::TYPE_NONE: + break; + } + + if (released) { + type_ = EventCallback::Type::TYPE_NONE; + } + + return ret; + } + + void releaseData() { + switch (type_) { + case EventCallback::Type::TYPE_READ: { + auto freeFunc = ioVec_->freeFunc_; + freeFunc(ioVec_); + break; + } + case EventCallback::Type::TYPE_RECVMSG: { + auto freeFunc = msgHdr_->freeFunc_; + freeFunc(msgHdr_); + break; + } + case EventCallback::Type::TYPE_RECVMSG_MULTISHOT: + hdr_->freeFunc_(hdr_); + break; + case EventCallback::Type::TYPE_NONE: + break; + } + type_ = EventCallback::Type::TYPE_NONE; + } + }; + + EventCallbackData cbData_; + + void prepPollAdd( + struct io_uring_sqe* sqe, int fd, uint32_t events) noexcept { + CHECK(sqe); + ::io_uring_prep_poll_add(sqe, fd, events); + ::io_uring_sqe_set_data(sqe, this); + } + + void prepRead( + struct io_uring_sqe* sqe, + int fd, + const struct iovec* iov, + off_t offset, + bool registerFd) noexcept { + prepUtilFunc( + ::io_uring_prep_read, + sqe, + registerFd, + fd, + iov->iov_base, + (unsigned int)iov->iov_len, + offset); + } + + void prepWrite( + struct io_uring_sqe* sqe, + int fd, + const struct iovec* iov, + off_t 
offset, + bool registerFd) noexcept { + prepUtilFunc( + ::io_uring_prep_write, + sqe, + registerFd, + fd, + iov->iov_base, + (unsigned int)iov->iov_len, + offset); + } + + void prepRecvmsg( + struct io_uring_sqe* sqe, + int fd, + struct msghdr* msg, + bool registerFd) noexcept { + prepUtilFunc( + ::io_uring_prep_recvmsg, sqe, registerFd, fd, msg, MSG_TRUNC); + } + + template + void prepUtilFunc( + Fn fn, + struct io_uring_sqe* sqe, + bool registerFd, + int fd, + Args... args) { + CHECK(sqe); + if (registerFd && !fdRecord_) { + fdRecord_ = backend_->registerFd(fd); + } + + if (fdRecord_) { + fn(sqe, fdRecord_->idx_, std::forward(args)...); + sqe->flags |= IOSQE_FIXED_FILE; + } else { + fn(sqe, fd, std::forward(args)...); + } + + ::io_uring_sqe_set_data(sqe, this); + } + + void prepRecvmsgMultishot( + struct io_uring_sqe* sqe, int fd, struct msghdr* msg) noexcept { + CHECK(sqe); + ::io_uring_prep_recvmsg(sqe, fd, msg, MSG_TRUNC); + // this magic value is set in io_uring_prep_recvmsg_multishot, + // however this version of the library isn't available widely yet + // so just hardcode it here + constexpr uint16_t kMultishotFlag = 1U << 1; + sqe->ioprio |= kMultishotFlag; + if (IoUringBufferProviderBase* bp = backend_->bufferProvider()) { + sqe->buf_group = bp->gid(); + sqe->flags |= IOSQE_BUFFER_SELECT; + } + ::io_uring_sqe_set_data(sqe, this); + } + + FOLLY_ALWAYS_INLINE void prepCancel( + struct io_uring_sqe* sqe, IoSqe* cancel_sqe) { + CHECK(sqe); + ::io_uring_prep_cancel(sqe, UserData{cancel_sqe}, 0); + ::io_uring_sqe_set_data(sqe, this); + } + }; + + using IoSqeBaseList = boost::intrusive:: + list>; + using IoSqeList = boost::intrusive:: + list>; + + struct FileOpIoSqe : public IoSqe { + FileOpIoSqe(IoUringBackend* backend, int fd, FileOpCallback&& cb) + : IoSqe(backend, false), fd_(fd), cb_(std::move(cb)) {} + + void processActive() override { cb_(res_); } + + int fd_{-1}; + + FileOpCallback cb_; + }; + + struct ReadWriteIoSqe : public FileOpIoSqe { + 
ReadWriteIoSqe( + IoUringBackend* backend, + int fd, + const struct iovec* iov, + off_t offset, + FileOpCallback&& cb) + : FileOpIoSqe(backend, fd, std::move(cb)), + iov_(iov, iov + 1), + offset_(offset) {} + + ReadWriteIoSqe( + IoUringBackend* backend, + int fd, + Range iov, + off_t offset, + FileOpCallback&& cb) + : FileOpIoSqe(backend, fd, std::move(cb)), iov_(iov), offset_(offset) {} + + void processActive() override { cb_(res_); } + + static constexpr size_t kNumInlineIoVec = 4; + folly::small_vector iov_; + off_t offset_; + }; + + struct ReadIoSqe : public ReadWriteIoSqe { + using ReadWriteIoSqe::ReadWriteIoSqe; + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + prepRead(sqe, fd_, iov_.data(), offset_, false); + } + }; + + struct WriteIoSqe : public ReadWriteIoSqe { + using ReadWriteIoSqe::ReadWriteIoSqe; + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + prepWrite(sqe, fd_, iov_.data(), offset_, false); + } + }; + + struct ReadvIoSqe : public ReadWriteIoSqe { + using ReadWriteIoSqe::ReadWriteIoSqe; + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + ::io_uring_prep_readv( + sqe, fd_, iov_.data(), (unsigned int)iov_.size(), offset_); + ::io_uring_sqe_set_data(sqe, this); + } + }; + + struct WritevIoSqe : public ReadWriteIoSqe { + using ReadWriteIoSqe::ReadWriteIoSqe; + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + ::io_uring_prep_writev( + sqe, fd_, iov_.data(), (unsigned int)iov_.size(), offset_); + ::io_uring_sqe_set_data(sqe, this); + } + }; + + enum class FSyncFlags { + FLAGS_FSYNC = 0, + FLAGS_FDATASYNC = 1, + }; + + struct FSyncIoSqe : public FileOpIoSqe { + FSyncIoSqe( + IoUringBackend* backend, int fd, FSyncFlags flags, FileOpCallback&& cb) + : FileOpIoSqe(backend, fd, std::move(cb)), flags_(flags) {} + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + unsigned int fsyncFlags = 0; + switch (flags_) { + case FSyncFlags::FLAGS_FSYNC: + fsyncFlags = 0; + 
break; + case FSyncFlags::FLAGS_FDATASYNC: + fsyncFlags = IORING_FSYNC_DATASYNC; + break; + } + + ::io_uring_prep_fsync(sqe, fd_, fsyncFlags); + ::io_uring_sqe_set_data(sqe, this); + } + + FSyncFlags flags_; + }; + + struct FOpenAtIoSqe : public FileOpIoSqe { + FOpenAtIoSqe( + IoUringBackend* backend, + int dfd, + const char* path, + int flags, + mode_t mode, + FileOpCallback&& cb) + : FileOpIoSqe(backend, dfd, std::move(cb)), + path_(path), + flags_(flags), + mode_(mode) {} + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + ::io_uring_prep_openat(sqe, fd_, path_.c_str(), flags_, mode_); + ::io_uring_sqe_set_data(sqe, this); + } + + std::string path_; + int flags_; + mode_t mode_; + }; + + struct FOpenAt2IoSqe : public FileOpIoSqe { + FOpenAt2IoSqe( + IoUringBackend* backend, + int dfd, + const char* path, + struct open_how* how, + FileOpCallback&& cb) + : FileOpIoSqe(backend, dfd, std::move(cb)), path_(path), how_(*how) {} + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + ::io_uring_prep_openat2(sqe, fd_, path_.c_str(), &how_); + ::io_uring_sqe_set_data(sqe, this); + } + + std::string path_; + struct open_how how_; + }; + + struct FCloseIoSqe : public FileOpIoSqe { + using FileOpIoSqe::FileOpIoSqe; + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + ::io_uring_prep_close(sqe, fd_); + ::io_uring_sqe_set_data(sqe, this); + } + }; + + struct FStatxIoSqe : public FileOpIoSqe { + FStatxIoSqe( + IoUringBackend* backend, + int dfd, + const char* pathname, + int flags, + unsigned int mask, + struct statx* statxbuf, + FileOpCallback&& cb) + : FileOpIoSqe(backend, dfd, std::move(cb)), + path_(pathname), + flags_(flags), + mask_(mask), + statxbuf_(statxbuf) {} + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + ::io_uring_prep_statx(sqe, fd_, path_, flags_, mask_, statxbuf_); + ::io_uring_sqe_set_data(sqe, this); + } + + const char* path_; + int flags_; + unsigned int mask_; + struct statx* 
statxbuf_; + }; + + struct FAllocateIoSqe : public FileOpIoSqe { + FAllocateIoSqe( + IoUringBackend* backend, + int fd, + int mode, + off_t offset, + off_t len, + FileOpCallback&& cb) + : FileOpIoSqe(backend, fd, std::move(cb)), + mode_(mode), + offset_(offset), + len_(len) {} + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + ::io_uring_prep_fallocate(sqe, fd_, mode_, offset_, len_); + ::io_uring_sqe_set_data(sqe, this); + } + + int mode_; + off_t offset_; + off_t len_; + }; + + struct SendmsgIoSqe : public FileOpIoSqe { + SendmsgIoSqe( + IoUringBackend* backend, + int fd, + const struct msghdr* msg, + unsigned int flags, + FileOpCallback&& cb) + : FileOpIoSqe(backend, fd, std::move(cb)), msg_(msg), flags_(flags) {} + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + ::io_uring_prep_sendmsg(sqe, fd_, msg_, flags_); + ::io_uring_sqe_set_data(sqe, this); + } + + const struct msghdr* msg_; + unsigned int flags_; + }; + + struct RecvmsgIoSqe : public FileOpIoSqe { + RecvmsgIoSqe( + IoUringBackend* backend, + int fd, + struct msghdr* msg, + unsigned int flags, + FileOpCallback&& cb) + : FileOpIoSqe(backend, fd, std::move(cb)), msg_(msg), flags_(flags) {} + + void processSubmit(struct io_uring_sqe* sqe) noexcept override { + ::io_uring_prep_recvmsg(sqe, fd_, msg_, flags_); + ::io_uring_sqe_set_data(sqe, this); + } + + struct msghdr* msg_; + unsigned int flags_; + }; + + size_t getActiveEvents(WaitForEventsMode waitForEvents); + size_t prepList(IoSqeBaseList& ioSqes); + int submitOne(); + int cancelOne(IoSqe* ioSqe); + + int submitBusyCheck(int num, WaitForEventsMode waitForEvents) noexcept; + int submitEager(); + + void queueFsync(int fd, FSyncFlags flags, FileOpCallback&& cb); + + void processFileOp(IoSqe* ioSqe, int res) noexcept; + + static void processFileOpCB( + IoUringBackend* backend, IoSqe* ioSqe, int res, uint32_t) { + static_cast(backend)->processFileOp(ioSqe, res); + } + + IoUringBackend::IoSqe* allocNewIoSqe(const 
EventCallback& /*cb*/) { + // allow pool alloc if numPooledIoSqeInUse_ < numEntries_ + auto* ret = new IoSqe(this, numPooledIoSqeInUse_ < numEntries_); + ++numPooledIoSqeInUse_; + ret->backendCb_ = IoUringBackend::processPollIoSqe; + + return ret; + } + + void cleanup(); + + struct io_uring_sqe* getUntrackedSqe(); + struct io_uring_sqe* getSqe(); + + /// some ring calls require being called on a single system thread, so we need + /// to delay init of those things until the correct thread is ready + void delayedInit(); + + /// init things that are linked to the io_uring submitter concept + /// so for DeferTaskrun, only do this in delayed init + void initSubmissionLinked(); + + Options options_; + size_t numEntries_; + std::unique_ptr timerEntry_; + std::unique_ptr signalReadEntry_; + IoSqeList freeList_; + bool usingDeferTaskrun_{false}; + + // timer related + int timerFd_{-1}; + bool timerChanged_{false}; + bool timerSet_{false}; + std::multimap timers_; + + // signal related + SocketPair signalFds_; + std::map> signals_; + + // submit + IoSqeBaseList submitList_; + uint16_t bufferProviderGidNext_{0}; + IoUringBufferProviderBase::UniquePtr bufferProvider_; + + // loop related + bool loopBreak_{false}; + bool shuttingDown_{false}; + bool processTimers_{false}; + bool processSignals_{false}; + IoSqeList activeEvents_; + size_t waitingToSubmit_{0}; + size_t numInsertedEvents_{0}; + size_t numInternalEvents_{0}; + size_t numSendEvents_{0}; + + // number of pooled IoSqe instances in use + size_t numPooledIoSqeInUse_{0}; + + // io_uring related + struct io_uring_params params_; + struct io_uring ioRing_; + + FdRegistry fdRegistry_; + + // poll callback to be invoked if POLL_CQ flag is set + // every time we poll for a CQE + CQPollLoopCallback cqPollLoopCallback_; + + bool needsDelayedInit_{true}; + + // stuff for ensuring we don't re-enter submit/getActiveEvents + folly::Optional submitTid_; + int isSubmitting_{0}; + bool gettingEvents_{false}; + void dCheckSubmitTid(); 
+ void setSubmitting() noexcept { isSubmitting_++; } + void doneSubmitting() noexcept { isSubmitting_--; } + void setGetActiveEvents() { + if (kIsDebug && gettingEvents_) { + throw std::runtime_error("getting events is not reentrant"); + gettingEvents_ = true; + } + } + void doneGetActiveEvents() noexcept { gettingEvents_ = false; } + bool isSubmitting() const noexcept { return isSubmitting_; } +}; + +using PollIoBackend = IoUringBackend; +} // namespace folly + +#endif diff --git a/folly/io/async/IoUringBase.h b/folly/io/async/IoUringBase.h new file mode 100644 index 00000000000..0e46e367461 --- /dev/null +++ b/folly/io/async/IoUringBase.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+struct io_uring_sqe;
+struct io_uring_cqe;
+
+namespace folly {
+
+class IoUringBackend;
+
+// Base type for a single io_uring submission. It carries an intrusive list
+// hook, an operation Type, and in-flight / cancelled flags. Subclasses
+// implement processSubmit() to fill in the SQE and callback() /
+// callbackCancelled() to receive the CQE.
+struct IoSqeBase
+    : boost::intrusive::list_base_hook<
+          boost::intrusive::link_mode> {
+  // Kind of operation this entry represents; the default constructor uses
+  // Unknown.
+  enum class Type {
+    Unknown,
+    Read,
+    Write,
+    Open,
+    Close,
+    Connect,
+    Cancel,
+  };
+
+  IoSqeBase() : IoSqeBase(Type::Unknown) {}
+  explicit IoSqeBase(Type type) : type_(type) {}
+  // use raw addresses, so disallow copy/move
+  IoSqeBase(IoSqeBase&&) = delete;
+  IoSqeBase(const IoSqeBase&) = delete;
+  IoSqeBase& operator=(IoSqeBase&&) = delete;
+  IoSqeBase& operator=(const IoSqeBase&) = delete;
+
+  virtual ~IoSqeBase() = default;
+  // Subclass hooks; all three are noexcept.
+  virtual void processSubmit(struct io_uring_sqe* sqe) noexcept = 0;
+  virtual void callback(const io_uring_cqe* cqe) noexcept = 0;
+  virtual void callbackCancelled(const io_uring_cqe* cqe) noexcept = 0;
+  IoSqeBase::Type type() const { return type_; }
+  bool inFlight() const { return inFlight_; }
+  bool cancelled() const { return cancelled_; }
+  // One-way: nothing in this struct clears cancelled_ again.
+  void markCancelled() { cancelled_ = true; }
+
+ protected:
+  // This is used if you want to prepare this sqe for reuse, but will manage the
+  // lifetime. For example for zerocopy send, you might want to reuse the sqe
+  // but still have a notification inbound.
+  void prepareForReuse() { internalUnmarkInflight(); }
+
+ private:
+  // The members below are private; IoUringBackend reaches them as a friend.
+  // internalSubmit / internalCallback are defined out of line.
+  friend class IoUringBackend;
+  void internalSubmit(struct io_uring_sqe* sqe) noexcept;
+  void internalCallback(const io_uring_cqe* cqe) noexcept;
+  void internalUnmarkInflight() { inFlight_ = false; }
+
+  bool inFlight_ = false;
+  bool cancelled_ = false;
+  Type type_;
+};
+
+// Abstract source of buffers addressed by a 16-bit index. Each provider is
+// tagged with a group id (gid) and a fixed per-buffer size, both set at
+// construction and immutable. Not copyable or movable.
+class IoUringBufferProviderBase {
+ protected:
+  uint16_t const gid_;
+  size_t const sizePerBuffer_;
+
+ public:
+  // Deleter that releases a provider through its own destroy() rather than
+  // `delete`; a null pointer is ignored.
+  struct Deleter {
+    void operator()(IoUringBufferProviderBase* base) {
+      if (base) {
+        base->destroy();
+      }
+    }
+  };
+
+  // NOTE(review): template arguments are not visible here; presumably
+  // unique_ptr<IoUringBufferProviderBase, Deleter> - confirm.
+  using UniquePtr = std::unique_ptr;
+  explicit IoUringBufferProviderBase(uint16_t gid, size_t sizePerBuffer)
+      : gid_(gid), sizePerBuffer_(sizePerBuffer) {}
+  virtual ~IoUringBufferProviderBase() = default;
+
+  IoUringBufferProviderBase(IoUringBufferProviderBase&&) = delete;
+  IoUringBufferProviderBase(IoUringBufferProviderBase const&) = delete;
+  IoUringBufferProviderBase& operator=(IoUringBufferProviderBase&&) = delete;
+  IoUringBufferProviderBase& operator=(IoUringBufferProviderBase const&) =
+      delete;
+
+  size_t sizePerBuffer() const { return sizePerBuffer_; }
+  uint16_t gid() const { return gid_; }
+
+  // Implementation-defined operations; `i` is a buffer index.
+  virtual uint32_t count() const noexcept = 0;
+  virtual void unusedBuf(uint16_t i) noexcept = 0;
+  virtual std::unique_ptr getIoBuf(
+      uint16_t i, size_t length) noexcept = 0;
+  virtual void enobuf() noexcept = 0;
+  virtual bool available() const noexcept = 0;
+  virtual void destroy() noexcept = 0;
+};
+
+// Intrusive singly-linked-list node recording one fd registration: the fd,
+// its index (idx_) and a count (presumably a use count - confirm against
+// IoUringBackend::FdRegistry).
+struct IoUringFdRegistrationRecord : public boost::intrusive::slist_base_hook<
+                                         boost::intrusive::cache_last> {
+  int count_{0};
+  int fd_{-1};
+  int idx_{0};
+};
+
+} // namespace folly
diff --git a/folly/experimental/io/IoUringEvent.cpp b/folly/io/async/IoUringEvent.cpp
similarity index 98%
rename from folly/experimental/io/IoUringEvent.cpp
rename to folly/io/async/IoUringEvent.cpp
index 0260ee705db..ea90a07071c 100644
--- a/folly/experimental/io/IoUringEvent.cpp
+++
b/folly/io/async/IoUringEvent.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
-#include
+#include
 #if FOLLY_HAS_LIBURING
diff --git a/folly/io/async/IoUringEvent.h b/folly/io/async/IoUringEvent.h
new file mode 100644
index 00000000000..c048aaffb6c
--- /dev/null
+++ b/folly/io/async/IoUringEvent.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace folly {
+
+#if FOLLY_HAS_LIBURING
+
+// Holds an IoUringBackend by value together with a pointer to an EventBase,
+// and is itself both an EventHandler and an EventBase::LoopCallback. Only
+// declared when FOLLY_HAS_LIBURING is set.
+class IoUringEvent : public EventHandler, public EventBase::LoopCallback {
+ public:
+  // `o` configures the embedded backend; use_event_fd defaults to true.
+  // NOTE(review): presumably selects whether an eventfd (eventFd_) is used
+  // for wake-ups - confirm in IoUringEvent.cpp.
+  IoUringEvent(
+      folly::EventBase* eventBase,
+      IoUringBackend::Options const& o,
+      bool use_event_fd = true);
+  ~IoUringEvent() override;
+
+  // cannot move/copy due to postLoopCallback
+  IoUringEvent const& operator=(IoUringEvent const&) = delete;
+  IoUringEvent&& operator=(IoUringEvent&&) = delete;
+  IoUringEvent(IoUringEvent&&) = delete;
+  IoUringEvent(IoUringEvent const&) = delete;
+
+  // EventHandler override.
+  void handlerReady(uint16_t events) noexcept override;
+
+  // EventBase::LoopCallback override.
+  void runLoopCallback() noexcept override;
+
+  // Access to the embedded backend; the reference is valid for the lifetime
+  // of this object.
+  IoUringBackend& backend() { return backend_; }
+
+ private:
+  bool hasWork();
+  EventBase* eventBase_;
+  IoUringBackend backend_;
+
+  bool lastWasResignalled_ = false;
+  bool edgeTriggered_ = false;
+  std::optional eventFd_;
+};
+
+#endif
+
+} // namespace folly
diff --git a/folly/experimental/io/IoUringEventBaseLocal.cpp
b/folly/io/async/IoUringEventBaseLocal.cpp
similarity index 97%
rename from folly/experimental/io/IoUringEventBaseLocal.cpp
rename to folly/io/async/IoUringEventBaseLocal.cpp
index 872c3934bea..bfbb00bba0d 100644
--- a/folly/experimental/io/IoUringEventBaseLocal.cpp
+++ b/folly/io/async/IoUringEventBaseLocal.cpp
@@ -16,8 +16,8 @@
 #include
 #include
-#include
 #include
+#include
 #if FOLLY_HAS_LIBURING
diff --git a/folly/io/async/IoUringEventBaseLocal.h b/folly/io/async/IoUringEventBaseLocal.h
new file mode 100644
index 00000000000..745d9149aa1
--- /dev/null
+++ b/folly/io/async/IoUringEventBaseLocal.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace folly {
+
+#if FOLLY_HAS_LIBURING
+
+// Static-only helper that ties an IoUringBackend to an EventBase. Only
+// declared when FOLLY_HAS_LIBURING is set.
+class IoUringEventBaseLocal {
+ public:
+  // Sets up a backend for `evb` using `options`; use_eventfd defaults to true.
+  static void attach(
+      EventBase* evb,
+      IoUringBackend::Options const& options,
+      bool use_eventfd = true);
+  // Returns the backend associated with `evb`.
+  // NOTE(review): the name suggests nullptr when nothing is attached -
+  // confirm in IoUringEventBaseLocal.cpp.
+  static IoUringBackend* try_get(EventBase* evb);
+};
+
+#endif
+
+} // namespace folly
diff --git a/folly/experimental/io/IoUringProvidedBufferRing.cpp b/folly/io/async/IoUringProvidedBufferRing.cpp
similarity index 99%
rename from folly/experimental/io/IoUringProvidedBufferRing.cpp
rename to folly/io/async/IoUringProvidedBufferRing.cpp
index 49ea3e0b82a..a76224468aa 100644
--- a/folly/experimental/io/IoUringProvidedBufferRing.cpp
+++ b/folly/io/async/IoUringProvidedBufferRing.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
-#include
+#include
 #include
 #include
diff --git a/folly/io/async/IoUringProvidedBufferRing.h b/folly/io/async/IoUringProvidedBufferRing.h
new file mode 100644
index 00000000000..fdcb6c10bd0
--- /dev/null
+++ b/folly/io/async/IoUringProvidedBufferRing.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#if FOLLY_HAS_LIBURING
+
+#include // @manual
+
+namespace folly {
+
+// IoUringBufferProviderBase implementation whose storage lives in a
+// ProvidedBuffersBuffer: an io_uring_buf_ring plus a contiguous data area
+// sliced into fixed-size buffers.
+class IoUringProvidedBufferRing : public IoUringBufferProviderBase {
+ public:
+  // Exception type for failed liburing calls; inherits the
+  // std::runtime_error constructors.
+  class LibUringCallError : public std::runtime_error {
+   public:
+    using std::runtime_error::runtime_error;
+  };
+
+  IoUringProvidedBufferRing(
+      io_uring* ioRingPtr,
+      uint16_t gid,
+      int count,
+      int bufferShift,
+      int ringSizeShift);
+
+  void enobuf() noexcept override;
+  void unusedBuf(uint16_t i) noexcept override;
+  void destroy() noexcept override;
+  std::unique_ptr getIoBuf(uint16_t i, size_t length) noexcept override;
+
+  uint32_t count() const noexcept override { return buffer_.bufferCount(); }
+  // Available unless the enobuf_ flag is set; read with relaxed ordering.
+  bool available() const noexcept override {
+    return !enobuf_.load(std::memory_order_relaxed);
+  }
+
+ private:
+  void initialRegister();
+  void returnBufferInShutdown() noexcept;
+  void returnBuffer(uint16_t i) noexcept;
+
+  // Views the ring's `tail` field as an atomic so it can be updated with CAS.
+  std::atomic* sharedTail() {
+    return reinterpret_cast*>(&buffer_.ring()->tail);
+  }
+
+  // Compare-and-swap on the shared tail (release ordering); returns whether
+  // the swap from `expected` to `value` succeeded. `expected` is taken by
+  // value, so the observed tail is not reported back to the caller.
+  bool tryPublish(uint16_t expected, uint16_t value) noexcept {
+    return sharedTail()->compare_exchange_strong(
+        expected, value, std::memory_order_release);
+  }
+
+  char const* getData(uint16_t i) { return buffer_.buffer(i); }
+
+  // Owns the memory region behind the ring and the data buffers; the
+  // destructor releases it with munmap(buffer_, allSize_).
+  class ProvidedBuffersBuffer {
+   public:
+    ProvidedBuffersBuffer(
+        int count, int bufferShift, int ringCountShift, bool huge_pages);
+    ~ProvidedBuffersBuffer() { ::munmap(buffer_, allSize_); }
+
+    // Buffer size is 2^max(5, bufferShift), i.e. never below 32 bytes.
+    static size_t calcBufferSize(int bufferShift) {
+      return 1LLU << std::max(5, bufferShift);
+    }
+
+    struct io_uring_buf_ring* ring() const noexcept { return ringPtr_; }
+
+    // Ring slot for `idx`; the index wraps via ringMask_.
+    struct io_uring_buf* ringBuf(int idx) const noexcept {
+      return &ringPtr_->bufs[idx & ringMask_];
+    }
+
+    uint32_t bufferCount() const noexcept { return bufferCount_; }
+    uint32_t ringCount() const noexcept { return 1 + ringMask_; }
+
+    // Start of data buffer `idx`: bufferBuffer_ + (idx << bufferShift_).
+    char* buffer(uint16_t idx) {
+      size_t offset = (size_t)idx << bufferShift_;
+      return bufferBuffer_ + offset;
+    }
+
+    size_t sizePerBuffer() const { return sizePerBuffer_; }
+
+   private:
+    void* buffer_;
+    size_t allSize_;
+
+    size_t ringMemSize_;
+    struct io_uring_buf_ring* ringPtr_;
+    int ringMask_;
+
+    size_t bufferSize_;
+    size_t bufferShift_;
+    size_t sizePerBuffer_;
+    char* bufferBuffer_;
+    uint32_t bufferCount_;
+
+    // static constexpr
+    static constexpr size_t kHugePageMask = (1LLU << 21) - 1; // 2MB
+    static constexpr size_t kPageMask = (1LLU << 12) - 1; // 4095
+    static constexpr size_t kBufferAlignMask{31LLU};
+  };
+
+  io_uring* ioRingPtr_;
+  ProvidedBuffersBuffer buffer_;
+  std::atomic enobuf_{false};
+  std::vector ioBufCallbacks_;
+
+  uint64_t gottenBuffers_{0};
+  std::atomic returnedBuffers_{0};
+
+  std::atomic wantsShutdown_{false};
+  // NOTE(review): no default member initializer here, unlike the atomics
+  // above - confirm it is assigned before first use.
+  std::atomic shutdownReferences_;
+  std::mutex shutdownMutex_;
+};
+
+} // namespace folly
+
+#endif
diff --git a/folly/io/async/Liburing.h b/folly/io/async/Liburing.h
new file mode 100644
index 00000000000..8e81aaae9ca
--- /dev/null
+++ b/folly/io/async/Liburing.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// FOLLY_HAS_LIBURING is 1 only on Linux when the header probed by
+// __has_include is present, and 0 otherwise.
+#if defined(__linux__) && __has_include()
+#define FOLLY_HAS_LIBURING 1
+#else
+#define FOLLY_HAS_LIBURING 0
+#endif
diff --git a/folly/experimental/io/MuxIOThreadPoolExecutor.cpp b/folly/io/async/MuxIOThreadPoolExecutor.cpp
similarity index 99%
rename from folly/experimental/io/MuxIOThreadPoolExecutor.cpp
rename to folly/io/async/MuxIOThreadPoolExecutor.cpp
index f00c93e2554..fdb2f5c20ca 100644
--- a/folly/experimental/io/MuxIOThreadPoolExecutor.cpp
+++ b/folly/io/async/MuxIOThreadPoolExecutor.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
-#include
+#include
 #include
diff --git a/folly/io/async/MuxIOThreadPoolExecutor.h b/folly/io/async/MuxIOThreadPoolExecutor.h
new file mode 100644
index 00000000000..cfe25983eb5
--- /dev/null
+++ b/folly/io/async/MuxIOThreadPoolExecutor.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace folly {
+
+/**
+ * NOTE: This is highly experimental. Do not use.
+ *
+ * A pool of EventBases scheduled over a pool of threads.
+ *
+ * Intended as a drop-in replacement for folly::IOThreadPoolExecutor, but with a
+ * substantially different design: EventBases are not pinned to threads, so it
+ * is possible to have more EventBases than threads. EventBases that have ready
+ * events can be scheduled on any of the threads in the pool, with the
+ * scheduling governed by ThrottledLifoSem.
+ *
+ * This allows batching the loops of multiple EventBases on a single thread as
+ * long as each runs for a short enough time, reducing the number of wake-ups
+ * and allowing for better load balancing across handlers. For example, we can
+ * create a large number of EventBases processed by a smaller number of threads
+ * and distribute the handlers.
+ *
+ * The number of EventBases is set at construction time and cannot be changed
+ * later. The number of threads can be changed dynamically, but setting it to 0
+ * is not supported (otherwise no thread would be left to drive the EventBases)
+ * and it is not useful to run more threads than EventBases, so that is not
+ * supported either: attempting to set the number of threads to 0 or to a value
+ * greater than numEventBases() (either in construction or using
+ * setNumThreads()) will throw std::invalid_argument.
+ */
+class MuxIOThreadPoolExecutor : public IOThreadPoolExecutorBase {
+ public:
+  // Construction-time settings. Each setter stores its argument and returns
+  // *this, so calls can be chained.
+  struct Options {
+    Options() {}
+
+    Options& setEnableThreadIdCollection(bool b) {
+      enableThreadIdCollection = b;
+      return *this;
+    }
+
+    Options& setNumEventBases(size_t num) {
+      numEventBases = num;
+      return *this;
+    }
+
+    Options& setWakeUpInterval(std::chrono::nanoseconds w) {
+      wakeUpInterval = w;
+      return *this;
+    }
+
+    Options& setIdleSpinMax(std::chrono::nanoseconds s) {
+      idleSpinMax = s;
+      return *this;
+    }
+
+    bool enableThreadIdCollection{false};
+    // If 0, the number of EventBases is set to the number of threads.
+    size_t numEventBases{0};
+    std::chrono::nanoseconds wakeUpInterval{std::chrono::microseconds{100}};
+    // Max spin for an idle thread waiting for work before going to sleep.
+    std::chrono::nanoseconds idleSpinMax = std::chrono::microseconds{10};
+  };
+
+  // Defaults: default-constructed Options, a thread factory built with the
+  // name "MuxIOTPEx", and the EventBaseManager from
+  // folly::EventBaseManager::get().
+  explicit MuxIOThreadPoolExecutor(
+      size_t numThreads,
+      Options options = {},
+      std::shared_ptr threadFactory =
+          std::make_shared("MuxIOTPEx"),
+      folly::EventBaseManager* ebm = folly::EventBaseManager::get());
+
+  ~MuxIOThreadPoolExecutor() override;
+
+  size_t numEventBases() const { return numEventBases_; }
+
+  void add(Func func) override;
+  void add(
+      Func func,
+      std::chrono::milliseconds expiration,
+      Func expireCallback = nullptr) override;
+
+  folly::EventBase* getEventBase() override;
+
+  // Returns all the EventBase instances
+  std::vector> getAllEventBases()
+      override;
+
+  folly::EventBaseManager* getEventBaseManager() override;
+
+  // Returns nullptr unless explicitly enabled through constructor
+  folly::WorkerProvider* getThreadIdCollector() override {
+    return threadIdCollector_.get();
+  }
+
+  void addObserver(std::shared_ptr o) override;
+  void removeObserver(std::shared_ptr o) override;
+
+  void stop() override;
+  void join() override;
+
+ private:
+  using EventBasePoller = folly::detail::EventBasePoller;
+
+  // Defined out of line.
+  struct EvbState;
+
+  // Thread record extended with a pointer to an EvbState; aligned like
+  // Thread.
+  struct alignas(Thread) IOThread : public Thread {
+    EvbState* curEvbState; // Only accessed inside the worker thread.
+  };
+
+  void maybeUnregisterEventBases(Observer* o);
+
+  void validateNumThreads(size_t numThreads) override;
+  ThreadPtr makeThread() override;
+  EvbState& pickEvbState();
+  void threadRun(ThreadPtr thread) override;
+  void stopThreads(size_t n) override;
+  size_t getPendingTaskCountImpl() const override final;
+
+  const Options options_;
+  const size_t numEventBases_;
+  folly::EventBaseManager* eventBaseManager_;
+
+  std::unique_ptr fdGroup_;
+  std::vector> evbStates_;
+  std::vector> keepAlives_;
+
+  relaxed_atomic nextEvb_{0};
+  folly::ThreadLocal> thisThread_;
+  std::unique_ptr threadIdCollector_;
+  std::atomic pendingTasks_{0};
+
+  USPMCQueue readyQueue_;
+  folly::ThrottledLifoSem readyQueueSem_;
+};
+
+} // namespace folly
diff --git a/folly/experimental/io/SimpleAsyncIO.cpp b/folly/io/async/SimpleAsyncIO.cpp
similarity index 99%
rename from folly/experimental/io/SimpleAsyncIO.cpp
rename to folly/io/async/SimpleAsyncIO.cpp
index 5d586efc350..a8a41bc7e0b 100644
--- a/folly/experimental/io/SimpleAsyncIO.cpp
+++ b/folly/io/async/SimpleAsyncIO.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
-#include
+#include
 #include
 #include
diff --git a/folly/io/async/SimpleAsyncIO.h b/folly/io/async/SimpleAsyncIO.h
new file mode 100644
index 00000000000..fd7310016c2
--- /dev/null
+++ b/folly/io/async/SimpleAsyncIO.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace folly { + +/** + * SimpleAsyncIO is a wrapper around AsyncIO intended to hide all the details. + * + * Usage: just create an instance of SimpleAsyncIO and then issue IO with + * pread and pwrite, no other effort required. e.g.: + * + * + * auto tmpfile = folly::File::temporary(); + * folly::SimpleAsyncIO aio; + * aio.pwrite( + * tmpfile.fd(), + * "hello world", + * 11, // size + * 0, // offset + * [](int rc) { LOG(INFO) << "Write completed with rc " << rc; }); + * + * + * IO is dispatched in the context of the calling thread; it may block briefly + * to obtain a lock on shared resources, but will *not* block for IO + * completion. If the IO queue is full (see setMaxRequests(size_t) in Config), + * IO fails with -EBUSY. + * + * IO is completed on the executor specified in the config (global CPU + * executor by default). + * + * IO is completed by calling the callback function provided to pread/pwrite. + * The single parameter to the callback is either a negative errno or the + * number of bytes transferred. + * + * There is a "hidden" EventBase which polls for IO completion and dispatches + * completion events to the executor. You may specify an existing EventBase in + * the config (and you are then responsible for making sure the EventBase + * instance outlives the SimpleAsyncIO instance). If you do not specify one, a + * ScopedEventBaseThread instance will be created. + * + * Following structure defines the configuration of a SimpleAsyncIO instance, + * in case you need to override the (sensible) defaults. 
+ * + * Typical usage is something like: + * + * SimpleAsyncIO io(SimpleAsyncIO::Config() + * .setMaxRequests(100) + * .setMode(SimpleAsyncIO::Mode::IOURING)); + */ +class SimpleAsyncIO : public EventHandler { + public: + /** + * The asynchronous I/O backend to use: libaio or liburing + */ + enum Mode { + /// use libaio + AIO, + /// use liburing + IOURING + }; + /** + * The Config for SimpleAsyncIO, covering: + * - which backend implementation to use + * - which executor receives completions + * - the maximum number of requests allowed + */ + struct Config { + Config() + : maxRequests_(1000), + completionExecutor_( + getKeepAliveToken(getUnsafeMutableGlobalCPUExecutor().get())), + mode_(AIO), + evb_(nullptr) {} + /// Maximum number of requests that can be queued; requests above this + /// threshold fail with -EBUSY + Config& setMaxRequests(size_t maxRequests) { + maxRequests_ = maxRequests; + return *this; + } + Config& setCompletionExecutor(Executor::KeepAlive<> completionExecutor) { + completionExecutor_ = completionExecutor; + return *this; + } + Config& setMode(Mode mode) { + mode_ = mode; + return *this; + } + Config& setEventBase(EventBase* evb) { + evb_ = evb; + return *this; + } + + private: + size_t maxRequests_; + Executor::KeepAlive<> completionExecutor_; + Mode mode_; + EventBase* evb_; + + friend class SimpleAsyncIO; + }; + + explicit SimpleAsyncIO(Config cfg = Config()); + virtual ~SimpleAsyncIO() override; + + using SimpleAsyncIOCompletor = Function; + + /** + * Initiate an asynchronous read request. + * + * Parameters and return value are same as pread(2). + * + * Completion is indicated by an asynchronous call to the given completor + * callback. The sole parameter to the callback is the result of the + * operation. + * + * @returns Same as pread(2) and if requests number reaches maxRequests_, + * return -EBUSY + */ + void pread( + int fd, + void* buf, + size_t size, + off_t start, + SimpleAsyncIOCompletor completor); + + /** + * Initiate an asynchronous write request. 
+ * + * Parameters and return value are same as pwrite(2). + * + * Completion is indicated by an asynchronous call to the given completor + * callback. The sole parameter to the callback is the result of the + * operation. + * + * @returns Same as pwrite(2) and if requests number reaches maxRequests_, + * return -EBUSY + */ + void pwrite( + int fd, + const void* data, + size_t size, + off_t offset, + SimpleAsyncIOCompletor completor); + +#if FOLLY_HAS_COROUTINES + /** + * Coroutine version of pread(). + * + * Identical to pread() except that result is obtained by co_await instead of + * callback. + * + * @returns Same as pread(2) and if requests number reaches maxRequests_, + * return -EBUSY + */ + folly::coro::Task co_pread(int fd, void* buf, size_t size, off_t start); + /** + * Coroutine version of pwrite(). + * + * Identical to pwrite() except that result is obtained by co_await instead of + * callback. + * + * @returns Same as pwrite(2) and if requests number reaches maxRequests_, + * return -EBUSY + */ + folly::coro::Task co_pwrite( + int fd, const void* buf, size_t size, off_t start); +#endif + + private: + std::unique_ptr getOp(); + void putOp(std::unique_ptr&&); + + void submitOp( + Function preparer, SimpleAsyncIOCompletor completor); + + virtual void handlerReady(uint16_t events) noexcept override; + + template + void init(); + + size_t maxRequests_; + Executor::KeepAlive<> completionExecutor_; + std::unique_ptr asyncIO_; + Synchronized>> opsFreeList_; + std::unique_ptr evb_; + bool terminating_; + Baton<> drainedBaton_; +}; + +} // namespace folly diff --git a/folly/experimental/io/test/AsyncIOTest.cpp b/folly/io/async/test/AsyncIOTest.cpp similarity index 100% rename from folly/experimental/io/test/AsyncIOTest.cpp rename to folly/io/async/test/AsyncIOTest.cpp diff --git a/folly/experimental/io/test/AsyncIoUringSocketTest.cpp b/folly/io/async/test/AsyncIoUringSocketTest.cpp similarity index 100% rename from 
folly/experimental/io/test/AsyncIoUringSocketTest.cpp rename to folly/io/async/test/AsyncIoUringSocketTest.cpp diff --git a/folly/io/async/test/BUCK b/folly/io/async/test/BUCK index 339f43fba33..0b790dd1034 100644 --- a/folly/io/async/test/BUCK +++ b/folly/io/async/test/BUCK @@ -938,3 +938,179 @@ cpp_binary( "//folly/io/async/test:util", ], ) + +cpp_unittest( + name = "async_io_test", + srcs = ["AsyncIOTest.cpp"], + supports_static_listing = False, + deps = [ + "//folly/experimental/io:async_io", + "//folly/experimental/io/test:async_base_test_lib", + ], +) + +cpp_unittest( + name = "async_io_uring_socket_test", + srcs = ["AsyncIoUringSocketTest.cpp"], + supports_static_listing = False, + deps = [ + "//folly:file_util", + "//folly:subprocess", + "//folly/executors:global_executor", + "//folly/experimental/io:async_io_uring_socket", + "//folly/experimental/io:io_uring_backend", + "//folly/experimental/io:io_uring_event", + "//folly/futures:core", + "//folly/io/async:async_base", + "//folly/io/async:async_socket", + "//folly/io/async:server_socket", + "//folly/portability:gtest", + "//folly/system:shell", + "//folly/test:socket_address_test_helper", + ], +) + +cpp_unittest( + name = "epoll_backend_test", + srcs = ["EpollBackendTest.cpp"], + owner = "dmm@xmail.facebook.com", + supports_static_listing = False, + deps = [ + "//folly/experimental/io:epoll_backend", + "//folly/io/async/test:async_signal_handler_test_lib", + "//folly/io/async/test:event_base_test_lib", + ], +) + +cpp_binary( + name = "io_benchmark", + srcs = ["IOBenchmark.cpp"], + headers = [], + deps = [ + "//folly:benchmark", + "//folly:file_util", + "//folly/experimental/io:async_io", + "//folly/experimental/io:io_uring", + "//folly/experimental/io/test:async_base_test_lib", + "//folly/experimental/io/test:io_test_temp_file_util_lib", + "//folly/portability:gflags", + ], +) + +cpp_binary( + name = "io_uring_backend_bench", + srcs = ["IoUringBackendBench.cpp"], + headers = [], + deps = [ + 
"//folly:benchmark", + "//folly:file_util", + "//folly/experimental/io:epoll_backend", + "//folly/experimental/io:io_uring_backend", + "//folly/init:init", + "//folly/io/async:async_base", + "//folly/io/async:scoped_event_base_thread", + "//folly/portability:gflags", + ], +) + +cpp_unittest( + name = "io_uring_backend_setup_test", + srcs = ["IoUringBackendSetupTest.cpp"], + owner = "kvigor@xmail.facebook.com", + deps = [ + "//folly/experimental/io:io_uring_backend", + "//folly/portability:gtest", + ], +) + +cpp_unittest( + name = "io_uring_backend_test", + srcs = ["IoUringBackendTest.cpp"], + headers = [], + owner = "dmm@xmail.facebook.com", + supports_static_listing = False, + deps = [ + "//folly:file_util", + "//folly:function", + "//folly:string", + "//folly/experimental/io:io_uring_backend", + "//folly/experimental/io/test:io_test_temp_file_util_lib", + "//folly/init:init", + "//folly/io/async:async_base", + "//folly/io/async:async_udp_server_socket", + "//folly/io/async:async_udp_socket", + "//folly/io/async/test:async_signal_handler_test_lib", + "//folly/io/async/test:event_base_test_lib", + "//folly/portability:gtest", + ], +) + +cpp_unittest( + name = "io_uring_event_base_local_test", + srcs = ["IoUringEventBaseLocalTest.cpp"], + owner = "dylany@xmail.facebook.com", + deps = [ + "//folly/experimental/io:io_uring_backend", + "//folly/experimental/io:io_uring_event_base_local", + "//folly/futures:core", + "//folly/portability:gtest", + ], +) + +cpp_unittest( + name = "io_uring_event_test", + srcs = ["IoUringEventTest.cpp"], + owner = "dylany@xmail.facebook.com", + supports_static_listing = False, + deps = [ + "//folly/experimental/io:io_uring_backend", + "//folly/experimental/io:io_uring_event", + "//folly/futures:core", + "//folly/io/async:async_base", + "//folly/portability:gtest", + ], +) + +cpp_unittest( + name = "io_uring_test", + srcs = ["IoUringTest.cpp"], + owner = "dmm@xmail.facebook.com", + supports_static_listing = False, + deps = [ + 
"//folly/experimental/io:io_uring", + "//folly/experimental/io/test:async_base_test_lib", + "//folly/init:init", + ], +) + +cpp_binary( + name = "registered_fd_benchmark", + srcs = ["RegisteredFdBenchmark.cpp"], + headers = [], + deps = [ + "//folly:benchmark", + "//folly:file_util", + "//folly/experimental/io:io_uring_backend", + "//folly/io/async:async_base", + "//folly/portability:gflags", + ], +) + +cpp_unittest( + name = "simple_async_io_test", + srcs = ["SimpleAsyncIOTest.cpp"], + supports_static_listing = False, + deps = [ + "//folly:file", + "//folly:random", + "//folly/experimental/coro:blocking_wait", + "//folly/experimental/coro:collect", + "//folly/experimental/io:simple_async_io", + "//folly/io:iobuf", + "//folly/portability:gtest", + "//folly/synchronization:baton", + ], + external_deps = [ + "glog", + ], +) diff --git a/folly/experimental/io/test/EpollBackendTest.cpp b/folly/io/async/test/EpollBackendTest.cpp similarity index 100% rename from folly/experimental/io/test/EpollBackendTest.cpp rename to folly/io/async/test/EpollBackendTest.cpp diff --git a/folly/experimental/io/test/IOBenchmark.cpp b/folly/io/async/test/IOBenchmark.cpp similarity index 100% rename from folly/experimental/io/test/IOBenchmark.cpp rename to folly/io/async/test/IOBenchmark.cpp diff --git a/folly/experimental/io/test/IoUringBackendBench.cpp b/folly/io/async/test/IoUringBackendBench.cpp similarity index 100% rename from folly/experimental/io/test/IoUringBackendBench.cpp rename to folly/io/async/test/IoUringBackendBench.cpp diff --git a/folly/experimental/io/test/IoUringBackendSetupTest.cpp b/folly/io/async/test/IoUringBackendSetupTest.cpp similarity index 100% rename from folly/experimental/io/test/IoUringBackendSetupTest.cpp rename to folly/io/async/test/IoUringBackendSetupTest.cpp diff --git a/folly/experimental/io/test/IoUringBackendTest.cpp b/folly/io/async/test/IoUringBackendTest.cpp similarity index 100% rename from folly/experimental/io/test/IoUringBackendTest.cpp 
rename to folly/io/async/test/IoUringBackendTest.cpp diff --git a/folly/experimental/io/test/IoUringEventBaseLocalTest.cpp b/folly/io/async/test/IoUringEventBaseLocalTest.cpp similarity index 100% rename from folly/experimental/io/test/IoUringEventBaseLocalTest.cpp rename to folly/io/async/test/IoUringEventBaseLocalTest.cpp diff --git a/folly/experimental/io/test/IoUringEventTest.cpp b/folly/io/async/test/IoUringEventTest.cpp similarity index 100% rename from folly/experimental/io/test/IoUringEventTest.cpp rename to folly/io/async/test/IoUringEventTest.cpp diff --git a/folly/experimental/io/test/IoUringTest.cpp b/folly/io/async/test/IoUringTest.cpp similarity index 100% rename from folly/experimental/io/test/IoUringTest.cpp rename to folly/io/async/test/IoUringTest.cpp diff --git a/folly/experimental/io/test/RegisteredFdBenchmark.cpp b/folly/io/async/test/RegisteredFdBenchmark.cpp similarity index 100% rename from folly/experimental/io/test/RegisteredFdBenchmark.cpp rename to folly/io/async/test/RegisteredFdBenchmark.cpp diff --git a/folly/experimental/io/test/SimpleAsyncIOTest.cpp b/folly/io/async/test/SimpleAsyncIOTest.cpp similarity index 100% rename from folly/experimental/io/test/SimpleAsyncIOTest.cpp rename to folly/io/async/test/SimpleAsyncIOTest.cpp