From 4d9aec9c8cedb7c8454023f88ad19cf248fe8bd8 Mon Sep 17 00:00:00 2001 From: Danny Mayer Date: Thu, 1 Aug 2002 03:46:21 +0000 Subject: [PATCH] Redesigned sockets to use I/O Completion Ports and Events and eliminating multiple socket bugs reported --- lib/isc/win32/socket.c | 2248 ++++++++++++++++++++-------------------- 1 file changed, 1116 insertions(+), 1132 deletions(-) diff --git a/lib/isc/win32/socket.c b/lib/isc/win32/socket.c index c68c7bb9b0..f258ca5c27 100644 --- a/lib/isc/win32/socket.c +++ b/lib/isc/win32/socket.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2000-2002 Internet Software Consortium. + * Copyright (C) 2000, 2001 Internet Software Consortium. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -15,7 +15,7 @@ * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: socket.c,v 1.18 2002/05/27 00:40:23 marka Exp $ */ +/* $Id: socket.c,v 1.19 2002/08/01 03:46:21 mayer Exp $ */ #define MAKE_EXTERNAL 1 #include @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -43,18 +44,31 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include #include +#include #include "errno2result.h" -#define MAX_SELECT_SECONDS 0 -#define MAX_SELECT_MILLISECONDS 400 +/* + * Define this macro to control the behavior of connection + * resets on UDP sockets. See Microsoft KnowledgeBase Article Q263823 + * for details. + * NOTE: This requires that Windows 2000 systems install Service Pack 2 + * or later. + */ +#ifndef SIO_UDP_CONNRESET +#define SIO_UDP_CONNRESET _WSAIOW(IOC_VENDOR,12) +#endif + /* * Some systems define the socket length argument as an int, some as size_t, * some as socklen_t. This is here so it can be easily changed if needed. @@ -72,6 +86,7 @@ * work around it here. */ #define SOFT_ERROR(e) ((e) == WSAEINTR || \ + (e) == WSA_IO_PENDING || \ (e) == WSAEWOULDBLOCK || \ (e) == EWOULDBLOCK || \ (e) == EINTR || \ @@ -120,19 +135,6 @@ typedef isc_event_t intev_t; * a setsockopt() like interface to request timestamps, and if the OS * doesn't do it for us, call gettimeofday() on every UDP receive? */ -#ifdef SO_TIMESTAMP -#ifndef USE_CMSG -#define USE_CMSG 1 -#endif -#endif - -/* - * Check to see if we have even basic support for cracking messages from - * the control data returned from/sent via recvmsg()/sendmsg(). - */ -#if defined(USE_CMSG) && (!defined(CMSG_LEN) || !defined(CMSG_SPACE)) -#undef USE_CMSG -#endif /* * We really don't want to try and use these control messages. Win32 @@ -144,10 +146,7 @@ typedef isc_event_t intev_t; * Message header for recvmsg and sendmsg calls. * Used value-result for recvmsg, value only for sendmsg. */ -struct iovec { - void *iov_base; /* starting address of buffer */ - size_t iov_len; /* size of buffer */ -}; + struct msghdr { void *msg_name; /* optional address */ @@ -158,7 +157,7 @@ struct msghdr { u_int msg_controllen; /* ancillary data buffer len */ int msg_flags; /* flags on received message */ } msghdr; - + /* * The number of times a send operation is repeated if the result is EINTR. */ @@ -170,11 +169,19 @@ struct isc_socket { isc_socketmgr_t *manager; isc_mutex_t lock; isc_sockettype_t type; + OVERLAPPED overlapped; + /* Pointers to scatter/gather buffers */ + WSABUF iov[ISC_SOCKET_MAXSCATTERGATHER]; + struct msghdr *messagehdr; + size_t totalBytes; + int iEvent; /* Index into Event Array */ + WSAEVENT hEvent; /* Event Handle */ + long wait_type; /* Events to wait on */ /* Locked by socket lock. */ ISC_LINK(isc_socket_t) link; unsigned int references; - int fd; + SOCKET fd; int pf; ISC_LIST(isc_socketevent_t) send_list; @@ -192,19 +199,66 @@ struct isc_socket { isc_sockaddr_t address; /* remote address */ - unsigned int pending_recv : 1, - pending_send : 1, + unsigned int pending_close : 1, pending_accept : 1, - listener : 1, /* listener socket */ + iocp : 1, /* I/O Completion Port */ + listener : 1, /* listener socket */ connected : 1, connecting : 1, /* connect pending */ - bound : 1; /* bound to local addr */ + bound : 1; /* bound to local addr */ -#ifdef ISC_NET_RECVOVERFLOW - unsigned char overflow; /* used for MSG_TRUNC fake */ -#endif }; +/* + * I/O Completion ports Info structures + */ + +static HANDLE hHeapHandle = NULL; +static int iocp_total = 0; +typedef struct IoCompletionInfo { + OVERLAPPED overlapped; + isc_socketevent_t *dev; + int request_type; +} IoCompletionInfo; + +/* + * Define a maximum number of I/O Completion Port worker threads + * to handle the load on the Completion Port + */ +#define MAX_IOCPTHREADS 20 + +/* + * event_change structure to handle adds and deletes from the list of + * events in the Wait + */ +typedef struct event_change event_change_t; + +struct event_change { + isc_socket_t *sock; + int iEvent; + SOCKET fd; + unsigned int action; + ISC_LINK(event_change_t) link; +}; + +/* + * Note: We are using an array here since *WaitForMultiple* wants an array + */ + +#define MAX_EVENTS 64 + +/* + * List of events being waited on and their associated sockets + */ +typedef struct sock_event_list { + int max_event; + int total_events; + isc_socket_t *aSockList[MAX_EVENTS]; + WSAEVENT aEventList[MAX_EVENTS]; + isc_mutex_t EventLock; + ISC_LIST(event_change_t) event_updates; +} sock_event_list; + #define SOCKET_MANAGER_MAGIC ISC_MAGIC('I', 'O', 'm', 'g') #define VALID_MANAGER(m) ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC) @@ -215,16 +269,15 @@ struct isc_socketmgr { isc_mutex_t lock; /* Locked by manager lock. */ ISC_LIST(isc_socket_t) socklist; - fd_set read_fds; - fd_set write_fds; - fd_set except_fds; - isc_socket_t *fds[FD_SETSIZE]; - int fdstate[FD_SETSIZE]; - int maxfd; - int minfd; + sock_event_list sockev_list; + int event_written; + isc_boolean_t bShutdown; isc_thread_t watcher; isc_condition_t shutdown_ok; - int pipe_fds[2]; + HANDLE hIoCompletionPort; + int maxIOCPThreads; + HANDLE hIOCPThreads[MAX_IOCPTHREADS]; + DWORD dwIOCPThreadIds[MAX_IOCPTHREADS]; }; #define CLOSED 0 /* this one must be zero */ @@ -235,23 +288,21 @@ struct isc_socketmgr { * send() and recv() iovec counts */ #define MAXSCATTERGATHER_SEND (ISC_SOCKET_MAXSCATTERGATHER) -#ifdef ISC_NET_RECVOVERFLOW -# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER + 1) -#else -# define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER) -#endif +#define MAXSCATTERGATHER_RECV (ISC_SOCKET_MAXSCATTERGATHER) +static isc_threadresult_t WINAPI SocketIoThread(LPVOID ThreadContext); static void send_recvdone_event(isc_socket_t *, isc_socketevent_t **); static void send_senddone_event(isc_socket_t *, isc_socketevent_t **); static void free_socket(isc_socket_t **); static isc_result_t allocate_socket(isc_socketmgr_t *, isc_sockettype_t, isc_socket_t **); -static void destroy(isc_socket_t **); -static void internal_accept(isc_task_t *, isc_event_t *); -static void internal_connect(isc_task_t *, isc_event_t *); -static void internal_recv(isc_task_t *, isc_event_t *); -static void internal_send(isc_task_t *, isc_event_t *); -static void process_cmsg(isc_socket_t *, struct msghdr *, isc_socketevent_t *); +static isc_result_t +socket_recv(isc_socket_t *, isc_socketevent_t *, isc_task_t *, unsigned int); +static void destroy_socket(isc_socket_t **); +static void internal_accept(isc_socket_t *, int); +static void internal_connect(isc_socket_t *, int); +static void internal_recv(isc_socket_t *, isc_socketevent_t *, int, int); +static void internal_send(isc_socket_t *, isc_socketevent_t *, int, int); static void build_msghdr_send(isc_socket_t *, isc_socketevent_t *, struct msghdr *, char *cmsg, WSABUF *, size_t *); @@ -259,44 +310,401 @@ static void build_msghdr_recv(isc_socket_t *, isc_socketevent_t *, struct msghdr *, char *cmsg, WSABUF *, size_t *); -#define SELECT_POKE_SHUTDOWN (-1) -#define SELECT_POKE_NOTHING (-2) -#define SELECT_POKE_READ (-3) -#define SELECT_POKE_ACCEPT (-3) /* Same as _READ */ -#define SELECT_POKE_WRITE (-4) -#define SELECT_POKE_CONNECT (-5) -#define SELECT_POKE_CLOSE (-6) -long bpipe_written = 0; +enum { + SOCKET_CANCEL, + SOCKET_SHUTDOWN, + SOCKET_RECV, + SOCKET_SEND, + SOCK_ACCEPT +}; + +enum { + EVENT_ADD, + EVENT_DELETE +}; #define SOCK_DEAD(s) ((s)->references == 0) + +#if defined(ISC_SOCKET_DEBUG) /* - * Routine to handle error messages + * This is used to duump the contents of the sock structure + * You should make sure that the sock is locked before + * dumping it. Since the code uses simple printf() statements + * it should only be used interactively. */ - -#define ISC_STRERRORSIZE 128 - void -isc__strerror(int num, char *buf, size_t size) { - char *msg; - unsigned int unum = num; +sock_dump(isc_socket_t *sock) { + isc_socketevent_t *ldev; + isc_socket_newconnev_t *ndev; + isc_sockaddr_t addr; + char socktext[256]; - REQUIRE(buf != NULL); - msg = NTstrerror(num); - if (msg != NULL) - snprintf(buf, size, "%s", msg); - else - snprintf(buf, size, "Unknown error: %u", unum); + isc_socket_getpeername(sock, &addr); + isc_sockaddr_format(&addr, socktext, sizeof(socktext)); + printf("Remote Socket: %s\n", socktext); + isc_socket_getsockname(sock, &addr); + isc_sockaddr_format(&addr, socktext, sizeof(socktext)); + printf("This Socket: %s\n", socktext); + + printf("\n\t\tSock Dump\n"); + printf("\t\tfd: %u\n", sock->fd); + printf("\t\treferences: %d\n", sock->references); + printf("\t\tpending_accept: %d\n", sock->pending_accept); + printf("\t\tpending_close: %d\n", sock->pending_close); + printf("\t\tconnecting: %d\n", sock->connecting); + printf("\t\tconnected: %d\n", sock->connected); + printf("\t\tbound: %d\n", sock->bound); + printf("\t\tiocp: %d\n", sock->iocp); + printf("\t\tsocket type: %d\n", sock->type); + + printf("\n\t\tSock Recv List\n"); + ldev = ISC_LIST_HEAD(sock->recv_list); + while (ldev != NULL) { + printf("\t\tdev: %p\n", ldev); + ldev = ISC_LIST_NEXT(ldev, ev_link); + } + printf("\n\t\tSock Send List\n"); + ldev = ISC_LIST_HEAD(sock->send_list); + while (ldev != NULL) { + printf("\t\tdev: %p\n", ldev); + ldev = ISC_LIST_NEXT(ldev, ev_link); + } + printf("\n\t\tSock Accept List\n"); + ndev = ISC_LIST_HEAD(sock->accept_list); + while (ndev != NULL) { + printf("\t\tdev: %p\n", ldev); + ndev = ISC_LIST_NEXT(ndev, ev_link); + } +} +#endif + +/* This function will add an entry to the I/O completion port + * that will signal the I/O thread to exit (gracefully) + */ +static void +signal_iocompletionport_exit(isc_socketmgr_t *manager) { + int i; + int errval; + char strbuf[ISC_STRERRORSIZE]; + + for (i = 0; i < manager->maxIOCPThreads; i++) { + if (!PostQueuedCompletionStatus(manager->hIoCompletionPort, 0, 0, 0)) { + errval = GetLastError(); + isc__strerror(errval, strbuf, sizeof(strbuf)); + FATAL_ERROR(__FILE__, __LINE__, + isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_FAILED, + "Can't request service thread to exit: %s"), + strbuf); + } + } } +/* + * Create the worker threads for the I/O Completion Port + */ +void +iocompletionport_createthreads(int total_threads, isc_socketmgr_t *manager) { + int errval; + char strbuf[ISC_STRERRORSIZE]; + int i = 0; + + /* + * We need at least one + */ + for (i = 0; i < total_threads; i++) { + manager->hIOCPThreads[i] = CreateThread( NULL, 0, SocketIoThread, + manager, 0, + &manager->dwIOCPThreadIds[i]); + if(manager->hIOCPThreads[i] == NULL) { + errval = GetLastError(); + isc__strerror(errval, strbuf, sizeof(strbuf)); + FATAL_ERROR(__FILE__, __LINE__, + isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_FAILED, + "Can't create IOCP thread: %s"), + strbuf); + } + } +} /* - * The following routines are here to handle Unix emulation until we - * can rewrite the the routines in Winsock2 style. + * Create/initialise the I/O completion port */ +void +iocompletionport_init(isc_socketmgr_t *manager) { + int errval; + char strbuf[ISC_STRERRORSIZE]; + /* + * Create a private heap to handle the socket overlapped structure + * The miniumum number of structures is 10, there is no maximum + */ + hHeapHandle = HeapCreate(0, 10*sizeof(IoCompletionInfo), 0); + manager->maxIOCPThreads = min(isc_os_ncpus() + 1, + MAX_IOCPTHREADS); + /* Now Create the Completion Port */ + manager->hIoCompletionPort = CreateIoCompletionPort( + INVALID_HANDLE_VALUE, NULL, + 0, manager->maxIOCPThreads); + if (manager->hIoCompletionPort == NULL) { + errval = GetLastError(); + isc__strerror(errval, strbuf, sizeof(strbuf)); + FATAL_ERROR(__FILE__, __LINE__, + isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_FAILED, + "CreateIoCompletionPort() failed " + "during initialization: %s"), + strbuf); + exit(1); + } + + /* + * Worker threads for servicing the I/O + */ + iocompletionport_createthreads(manager->maxIOCPThreads, manager); +} + + +void +iocompletionport_exit(isc_socketmgr_t *manager) { + if (manager->hIoCompletionPort != NULL) { + /* Get each of the service threads to exit + */ + signal_iocompletionport_exit(manager); + } +} + +/* + * Add sockets in here and pass the sock data in as part of the information needed + */ +void +iocompletionport_update(isc_socket_t *sock) { + + HANDLE hiocp; + if(sock->iocp == 0) { + sock->iocp = 1; + hiocp = CreateIoCompletionPort((HANDLE) sock->fd, + sock->manager->hIoCompletionPort, (DWORD) sock, + sock->manager->maxIOCPThreads); + InterlockedIncrement(&iocp_total); + + } +} + +void +socket_event_minit(sock_event_list *evlist) { + BOOL bReset; + int i; + + /* Initialize the Event List */ + evlist->max_event = 0; + evlist->total_events = 0; + for (i = 0; i < MAX_EVENTS; i++) { + evlist->aSockList[i] = NULL; + evlist->aEventList[i] = (WSAEVENT) 0; + } + + /* + * initialize the lock + */ + if (isc_mutex_init(&(evlist->EventLock)) != ISC_R_SUCCESS) { + UNEXPECTED_ERROR(__FILE__, __LINE__, + "isc_mutex_init() %s Event Lock", + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, "failed")); + } + evlist->aEventList[0] = WSACreateEvent(); + (evlist->max_event)++; + bReset = WSAResetEvent(evlist->aEventList[0]); + + ISC_LIST_INIT(evlist->event_updates); +} + +/* + * Note that the eventLock is already locked before calling this function + */ +isc_result_t +socket_eventlist_add(isc_socket_t *sock, sock_event_list *evlist) { + int max_event; + + REQUIRE(sock != NULL); + REQUIRE(sock->hEvent != NULL); + REQUIRE(evlist != NULL); + + max_event = evlist->max_event; + if(max_event >= MAX_EVENTS) { + return(ISC_R_NOSPACE); + } + + sock->iEvent = max_event; + evlist->aSockList[max_event] = sock; + evlist->aEventList[max_event] = sock->hEvent; + evlist->max_event++; + evlist->total_events++; + return (ISC_R_SUCCESS); +} +/* + * Note that the eventLock is locked before calling this function + * All Events and associated sockets are closed here + */ +void +socket_eventlist_delete(isc_socket_t *sock, SOCKET fd, int iEvent, + sock_event_list *evlist) { + int i; + WSAEVENT hEvent; + + REQUIRE(evlist != NULL); + REQUIRE(iEvent > 0); + + hEvent = evlist->aEventList[iEvent]; + if (hEvent != NULL) + WSACloseEvent(hEvent); + + + for(i = iEvent; i < evlist->max_event; i++) { + evlist->aEventList[i] = evlist->aEventList[i + 1]; + evlist->aSockList[i] = evlist->aSockList[i + 1]; + if(evlist->aSockList[i] != NULL) + evlist->aSockList[i]->iEvent = i; + } + evlist->aEventList[evlist->max_event] = 0; + evlist->aSockList[evlist->max_event] = NULL; + if(sock != NULL) { + sock->iEvent = 0; + sock->pending_close = 1; + } + if (fd >= 0) + closesocket(fd); + evlist->max_event--; + evlist->total_events--; + +} +/* + * Get the event changes off of the list and apply the + * requested changes + */ +isc_result_t +process_eventlist(sock_event_list *evlist, isc_socketmgr_t *manager) { + event_change_t *evchange; + + evchange = ISC_LIST_HEAD(evlist->event_updates); + while (evchange != NULL) { + switch (evchange->action) { + case EVENT_ADD: + socket_eventlist_add(evchange->sock, evlist); + break; + case EVENT_DELETE: + socket_eventlist_delete(evchange->sock, evchange->fd, + evchange->iEvent, evlist); + break; + default: + break; + } + + ISC_LIST_DEQUEUE(evlist->event_updates, evchange, link); + HeapFree(hHeapHandle, 0, evchange); + manager->event_written--; + evchange = ISC_LIST_HEAD(evlist->event_updates); + } + manager->event_written = 0; + return (ISC_R_SUCCESS); +} +/* + * Add the event list changes to the queue and notify the + * event loop + */ +static void +notify_eventlist(isc_socket_t *sock, sock_event_list *evlist, + unsigned int action) { + + event_change_t *evchange; + + evchange = HeapAlloc(hHeapHandle, HEAP_ZERO_MEMORY, + sizeof(event_change_t)); + evchange->sock = sock; + evchange->action = action; + evchange->iEvent = sock->iEvent; + evchange->fd = sock->fd; + + LOCK(&evlist->EventLock); + ISC_LIST_APPEND(evlist->event_updates, evchange, link); + + sock->manager->event_written++; + UNLOCK(&evlist->EventLock); + WSASetEvent(evlist->aEventList[0]); /* Alert the Wait List */ +} +/* + * Note that the socket is already locked before calling this function + */ +isc_result_t +socket_event_add(isc_socket_t *sock, long type) { + int stat; + WSAEVENT hEvent; + sock_event_list *evlist; + + REQUIRE(sock != NULL); + evlist = &(sock->manager->sockev_list); + + hEvent = WSACreateEvent(); + if (hEvent == WSA_INVALID_EVENT) { + stat = WSAGetLastError(); + return (ISC_R_UNEXPECTED); + } + if (WSAEventSelect(sock->fd, hEvent, type) != 0) { + stat = WSAGetLastError(); + WSACloseEvent(hEvent); + return (ISC_R_UNEXPECTED); + } + sock->hEvent = hEvent; + + sock->wait_type = type; + notify_eventlist(sock, evlist, EVENT_ADD); + return (ISC_R_SUCCESS); +} +/* + * Note that the socket is not locked before calling this function + */ +void +socket_event_delete(isc_socket_t *sock) { + sock_event_list *evlist; + + REQUIRE(sock != NULL); + REQUIRE(sock->hEvent != NULL); + evlist = &(sock->manager->sockev_list); + + if (sock->hEvent != NULL) { + sock->wait_type = 0; + sock->pending_close = 1; + notify_eventlist(sock, evlist, EVENT_DELETE); + sock->hEvent = NULL; + } + +} +/* + * Routine to cleanup and then close the socket. + * Only close the socket here if it is NOT associated + * with an event, otherwise the WSAWaitForMultipleEvents + * may fail due to the fact that the the Wait should not + * be running while closing an event or a socket. + */ +void +socket_close(isc_socket_t *sock) { + sock->pending_close = 1; + if (sock->hEvent != NULL) + socket_event_delete(sock); + else { + closesocket(sock->fd); + } + if (sock->iocp) { + sock->iocp = 0; + InterlockedDecrement(&iocp_total); + } + +} /* * Initialize socket services */ @@ -311,102 +719,110 @@ BOOL InitSockets() { err = WSAStartup(wVersionRequested, &wsaData); if ( err != 0 ) { /* Tell the user that we could not find a usable Winsock DLL */ - return (FALSE); + NTReportError("named", + "Application Requires Winsock 2.0 or later. Exiting"); + return(FALSE); } - - return (TRUE); + return(TRUE); } int -internal_pipe(int filedes[2]) { - int status; - unsigned int pipesize = 65535; - int mode = _O_TEXT; - - status = _pipe(filedes, pipesize, mode); - return (status); - InterlockedExchange(&bpipe_written, 0); -} - -int -internal_sendmsg(int sock, const struct msghdr *msg, int flags, int *merror) { +internal_sendmsg(isc_socket_t *sock, IoCompletionInfo *lpo, int flags, + int *Error) { + int Result; DWORD BytesSent; DWORD Flags = flags; + int total_sent; - *merror = WSASendTo((SOCKET) sock, - msg->msg_iov, - msg->msg_iovlen, - &BytesSent, - Flags, - msg->msg_name, - msg->msg_namelen, - NULL, - NULL); + *Error = 0; + Result = WSASendTo((SOCKET) sock->fd, + sock->messagehdr->msg_iov, + sock->messagehdr->msg_iovlen, + &BytesSent, + Flags, + sock->messagehdr->msg_name, + sock->messagehdr->msg_namelen, + (LPOVERLAPPED) lpo, + NULL); + + total_sent = (int) BytesSent; - if (*merror == SOCKET_ERROR) { - BytesSent = -1; - /* There is an error... */ - *merror = WSAGetLastError(); - if (*merror == WSA_IO_PENDING) { - /* Overlapped send successfully initiated. */ - errno = EAGAIN; - } else { - /* An unexpected error occurred. */ - errno = *merror; - } - } + /* Check for errors.*/ + if (Result == SOCKET_ERROR) { - /* No error -- the I/O request was completed immediately... */ - return (BytesSent); + *Error = WSAGetLastError(); + + switch (*Error) { + + case NO_ERROR : + case WSA_IO_INCOMPLETE : + case WSA_WAIT_IO_COMPLETION : + case WSA_IO_PENDING : + break ; + + default : + return (-1); + break; + } + } + if(lpo != NULL) + return (0); + else + return (total_sent); } int -internal_recvmsg(int sock, struct msghdr *msg, int flags, int *merror) { - DWORD Flags = flags; - DWORD NumBytes; +internal_recvmsg(isc_socket_t *sock, IoCompletionInfo *lpo, int flags, + int *Error) { + DWORD Flags = 0; + DWORD NumBytes = 0; + int total_bytes = 0; int Result; - Result = WSARecvFrom((SOCKET) sock, - msg->msg_iov, - msg->msg_iovlen, - &NumBytes, - &Flags, - msg->msg_name, - (int *)&(msg->msg_namelen), - NULL, - NULL); + *Error = 0; + Result = WSARecvFrom((SOCKET) sock->fd, + sock->messagehdr->msg_iov, + sock->messagehdr->msg_iovlen, + &NumBytes, + &Flags, + sock->messagehdr->msg_name, + (int *)&(sock->messagehdr->msg_namelen), + (LPOVERLAPPED) lpo, + NULL); + total_bytes = (int) NumBytes; /* Check for errors. */ if (Result == SOCKET_ERROR) { - *merror = WSAGetLastError(); - NumBytes = -1; - - switch (*merror) { - case WSAEWOULDBLOCK: - /* - * No data received; return to wait for another - * read event. - */ - errno = EAGAIN; - break; - default: - /* Some other error... hit the panic button. */ - errno = *merror; - break; - } + *Error = WSAGetLastError(); + + switch (*Error) { + + case NO_ERROR : + case WSA_IO_INCOMPLETE : + case WSA_WAIT_IO_COMPLETION : + case WSA_IO_PENDING : + break ; + + default : + return (-1); + break; + } } - msg->msg_flags = Flags; /* Return the flags received in header */ - - return (NumBytes); -} + /* Return the flags received in header */ + sock->messagehdr->msg_flags = Flags; + if(lpo != NULL) + return (-1); + else + return (total_bytes); + +} static void manager_log(isc_socketmgr_t *sockmgr, isc_logcategory_t *category, isc_logmodule_t *module, int level, - const char *fmt, ...) -{ + const char *fmt, ...) { char msgbuf[2048]; va_list ap; @@ -430,8 +846,7 @@ static void socket_log(isc_socket_t *sock, isc_sockaddr_t *address, isc_logcategory_t *category, isc_logmodule_t *module, int level, isc_msgcat_t *msgcat, int msgset, int message, - const char *fmt, ...) -{ + const char *fmt, ...) { char msgbuf[2048]; char peerbuf[256]; va_list ap; @@ -448,134 +863,27 @@ socket_log(isc_socket_t *sock, isc_sockaddr_t *address, msgcat, msgset, message, "socket %p: %s", sock, msgbuf); } else { - isc_sockaddr_format(address, peerbuf, sizeof(peerbuf)); + isc_sockaddr_format(address, peerbuf, sizeof peerbuf); isc_log_iwrite(isc_lctx, category, module, level, msgcat, msgset, message, "socket %p %s: %s", sock, peerbuf, msgbuf); } } -static void -wakeup_socket(isc_socketmgr_t *manager, int fd, int msg) { - isc_socket_t *sock; - - /* - * This is a wakeup on a socket. If the socket is not in the - * process of being closed, start watching it for either reads - * or writes. - */ - - INSIST(fd >= 0 && fd < FD_SETSIZE); - - if (manager->fdstate[fd] == CLOSE_PENDING) { - manager->fdstate[fd] = CLOSED; - FD_CLR(fd, &manager->read_fds); - FD_CLR(fd, &manager->write_fds); - FD_CLR(fd, &manager->except_fds); - closesocket(fd); - return; - } - if (manager->fdstate[fd] != MANAGED) - return; - - sock = manager->fds[fd]; - - /* - * If there are no events, or there is an event but we - * have already queued up the internal event on a task's - * queue, clear the bit. Otherwise, set it. - */ - if (msg == SELECT_POKE_READ) - FD_SET(sock->fd, &manager->read_fds); - if (msg == SELECT_POKE_WRITE) - FD_SET(sock->fd, &manager->write_fds); - if (msg == SELECT_POKE_CONNECT) { /* Need both here */ - FD_SET(sock->fd, &manager->write_fds); - FD_SET(sock->fd, &manager->except_fds); - } -} - /* - * Poke the select loop when there is something for us to do. - * The write is required (by POSIX) to complete. That is, we - * will not get partial writes. - */ -static void -select_poke(isc_socketmgr_t *mgr, int fd, int msg) { - int cc; - int buf[2]; - char strbuf[ISC_STRERRORSIZE]; - - buf[0] = fd; - buf[1] = msg; - - if (msg == SELECT_POKE_SHUTDOWN) { - do { - cc = _write(mgr->pipe_fds[1], buf, sizeof(buf)); - } while (cc < 0 && SOFT_ERROR(errno)); - - if (cc < 0) { - isc__strerror(errno, strbuf, sizeof(strbuf)); - FATAL_ERROR(__FILE__, __LINE__, - isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_WRITEFAILED, - "write() failed " - "during watcher poke: %s"), - strbuf); - } - - INSIST(cc == sizeof(buf)); - - InterlockedIncrement(&bpipe_written); - } else { - wakeup_socket(mgr, fd, msg); - } -} - -/* - * Read a message on the internal fd. - */ -static void -select_readmsg(isc_socketmgr_t *mgr, int *fd, int *msg) { - int buf[2]; - int cc; - char strbuf[ISC_STRERRORSIZE]; - - cc = _read(mgr->pipe_fds[0], buf, sizeof(buf)); - if (cc < 0) { - *msg = SELECT_POKE_NOTHING; - if (SOFT_ERROR(errno)) - return; - - isc__strerror(errno, strbuf, sizeof(strbuf)); - FATAL_ERROR(__FILE__, __LINE__, - isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_READFAILED, - "read() failed " - "during watcher poke: %s"), - strbuf); - - return; - } - INSIST(cc == sizeof(buf)); - *fd = buf[0]; - *msg = buf[1]; - -} -/* - * Make a fd non-blocking. + * Make an fd SOCKET non-blocking. */ static isc_result_t -make_nonblock(int fd) { +make_nonblock(SOCKET fd) { int ret; unsigned long flags = 1; char strbuf[ISC_STRERRORSIZE]; /* Set the socket to non-blocking */ - ret = ioctlsocket((SOCKET) fd, FIONBIO, &flags); + ret = ioctlsocket(fd, FIONBIO, &flags); if (ret == -1) { - isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf)); + isc__strerror(errno, strbuf, sizeof(strbuf)); UNEXPECTED_ERROR(__FILE__, __LINE__, "ioctlsocket(%d, FIOBIO, %d): %s", fd, flags, strbuf); @@ -585,103 +893,33 @@ make_nonblock(int fd) { return (ISC_R_SUCCESS); } - /* - * Process control messages received on a socket. + * Windows 2000 systems incorrectly cause UDP sockets using WASRecvFrom + * to not work correctly, returning a WSACONNRESET error when a WSASendTo + * fails with an "ICMP port unreachable" response and preventing the + * socket from using the WSARecvFrom in subsequent operations. + * The function below fixes this, but requires that Windows 2000 + * Service Pack 2 or later be installed on the system. NT 4.0 + * systems are not affected by this and work correctly. + * See Microsoft Knowledge Base Article Q263823 for details of this. */ -static void -process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) { -#ifdef USE_CMSG - struct cmsghdr *cmsgp; -#ifdef ISC_PLATFORM_HAVEIPV6 - struct in6_pktinfo *pktinfop; -#endif -#ifdef SO_TIMESTAMP - struct timeval *timevalp; -#endif -#endif +isc_result_t +connection_reset_fix(SOCKET fd) { + DWORD dwBytesReturned = 0; + BOOL bNewBehavior = FALSE; + DWORD status; - /* - * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined. - * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined. - * They are all here, outside of the CPP tests, because it is - * more consistent with the usual ISC coding style. - */ - UNUSED(sock); - UNUSED(msg); - UNUSED(dev); - -#ifndef ISC_NET_BSD44MSGHDR - return; - -#else /* defined ISC_NET_BSD44MSGHDR */ - -#ifdef MSG_TRUNC - if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC) - dev->attributes |= ISC_SOCKEVENTATTR_TRUNC; -#endif - -#ifdef MSG_CTRUNC - if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC) - dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC; -#endif - -#ifndef USE_CMSG - return; -#else - if (msg->msg_controllen == 0 || msg->msg_control == NULL) - return; - -#ifdef SO_TIMESTAMP - timevalp = NULL; -#endif -#ifdef ISC_PLATFORM_HAVEIPV6 - pktinfop = NULL; -#endif - - cmsgp = CMSG_FIRSTHDR(msg); - while (cmsgp != NULL) { - socket_log(sock, NULL, TRACE, - isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG, - "processing cmsg %p", cmsgp); - -#ifdef ISC_PLATFORM_HAVEIPV6 - if (cmsgp->cmsg_level == IPPROTO_IPV6 - && cmsgp->cmsg_type == IPV6_PKTINFO) { - - pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp); - memcpy(&dev->pktinfo, pktinfop, - sizeof(struct in6_pktinfo)); - dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO; - socket_log(sock, NULL, TRACE, - isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_IFRECEIVED, - "interface received on ifindex %u", - dev->pktinfo.ipi6_ifindex); - if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr)) - dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST; - goto next; - } -#endif - -#ifdef SO_TIMESTAMP - if (cmsgp->cmsg_level == SOL_SOCKET - && cmsgp->cmsg_type == SCM_TIMESTAMP) { - timevalp = (struct timeval *)CMSG_DATA(cmsgp); - dev->timestamp.seconds = timevalp->tv_sec; - dev->timestamp.nanoseconds = timevalp->tv_usec * 1000; - dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP; - goto next; - } -#endif - - next: - cmsgp = CMSG_NXTHDR(msg, cmsgp); - } -#endif /* USE_CMSG */ - -#endif /* ISC_NET_BSD44MSGHDR */ + if(isc_win32os_majorversion() < 5) + return (ISC_R_SUCCESS); /* NT 4.0 has no problem */ + /* disable new behavior using IOCTL: SIO_UDP_CONNRESET */ + status = WSAIoctl(fd, SIO_UDP_CONNRESET, &bNewBehavior, + sizeof(bNewBehavior), NULL, 0, + &dwBytesReturned, NULL, NULL); + if (status != SOCKET_ERROR) + return (ISC_R_SUCCESS); + else + return (ISC_R_UNEXPECTED); } /* @@ -699,18 +937,13 @@ process_cmsg(isc_socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) { static void build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev, struct msghdr *msg, char *cmsg, - WSABUF *iov, size_t *write_countp) -{ + WSABUF *iov, size_t *write_countp) { unsigned int iovcount; isc_buffer_t *buffer; isc_region_t used; size_t write_count; size_t skip_count; -#ifndef USE_CMSG - UNUSED(cmsg); -#endif - memset(msg, 0, sizeof (*msg)); if (sock->type == isc_sockettype_udp) { @@ -757,7 +990,7 @@ build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev, if (used.length > 0) { iov[iovcount].buf = (void *)(used.base - + skip_count); + + skip_count); iov[iovcount].len = used.length - skip_count; write_count += (used.length - skip_count); skip_count = 0; @@ -772,34 +1005,6 @@ build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev, msg->msg_iov = iov; msg->msg_iovlen = iovcount; -#ifdef ISC_NET_BSD44MSGHDR - msg->msg_control = NULL; - msg->msg_controllen = 0; - msg->msg_flags = 0; -#if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIPV6) - if ((sock->type == isc_sockettype_udp) - && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) { - struct cmsghdr *cmsgp; - struct in6_pktinfo *pktinfop; - - socket_log(sock, NULL, TRACE, - isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA, - "sendto pktinfo data, ifindex %u", - dev->pktinfo.ipi6_ifindex); - - msg->msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo)); - msg->msg_control = (void *)cmsg; - - cmsgp = (struct cmsghdr *)cmsg; - cmsgp->cmsg_level = IPPROTO_IPV6; - cmsgp->cmsg_type = IPV6_PKTINFO; - cmsgp->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); - pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp); - memcpy(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo)); - } -#endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */ -#endif /* ISC_NET_BSD44MSGHDR */ - if (write_countp != NULL) *write_countp = write_count; } @@ -819,27 +1024,18 @@ build_msghdr_send(isc_socket_t *sock, isc_socketevent_t *dev, static void build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev, struct msghdr *msg, char *cmsg, - WSABUF *iov, size_t *read_countp) -{ + WSABUF *iov, size_t *read_countp) { unsigned int iovcount; isc_buffer_t *buffer; isc_region_t available; size_t read_count; -#ifndef USE_CMSG - UNUSED(cmsg); -#endif - memset(msg, 0, sizeof (struct msghdr)); if (sock->type == isc_sockettype_udp) { memset(&dev->address, 0, sizeof(dev->address)); msg->msg_name = (void *)&dev->address.type.sa; msg->msg_namelen = sizeof(dev->address.type); -#ifdef ISC_NET_RECVOVERFLOW - /* If needed, steal one iovec for overflow detection. */ - maxiov--; -#endif } else { /* TCP */ msg->msg_name = NULL; msg->msg_namelen = 0; @@ -858,10 +1054,10 @@ build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev, iov[0].len = read_count; iovcount = 1; } else { - /* - * Multibuffer I/O. - * Skip empty buffers. - */ + /* + * Multibuffer I/O. + * Skip empty buffers. + */ while (buffer != NULL) { REQUIRE(ISC_BUFFER_VALID(buffer)); if (isc_buffer_availablelength(buffer) != 0) @@ -890,37 +1086,17 @@ build_msghdr_recv(isc_socket_t *sock, isc_socketevent_t *dev, * we know there is at least one iov left, since we stole it * at the top of this function. */ -#ifdef ISC_NET_RECVOVERFLOW - if (sock->type == isc_sockettype_udp) { - iov[iovcount].iov_base = (void *)(&sock->overflow); - iov[iovcount].iov_len = 1; - iovcount++; - } -#endif msg->msg_iov = iov; msg->msg_iovlen = iovcount; -#ifdef ISC_NET_BSD44MSGHDR - msg->msg_control = NULL; - msg->msg_controllen = 0; - msg->msg_flags = 0; -#if defined(USE_CMSG) - if (sock->type == isc_sockettype_udp) { - msg->msg_control = cmsg; - msg->msg_controllen = CMSG_BUF_SIZE; - } -#endif /* USE_CMSG */ -#endif /* ISC_NET_BSD44MSGHDR */ - if (read_countp != NULL) *read_countp = read_count; } static void set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock, - isc_socketevent_t *dev) -{ + isc_socketevent_t *dev) { if (sock->type == isc_sockettype_udp) { if (address != NULL) dev->address = *address; @@ -934,8 +1110,7 @@ set_dev_address(isc_sockaddr_t *address, isc_socket_t *sock, static isc_socketevent_t * allocate_socketevent(isc_socket_t *sock, isc_eventtype_t eventtype, - isc_taskaction_t action, const void *arg) -{ + isc_taskaction_t action, const void *arg) { isc_socketevent_t *ev; ev = (isc_socketevent_t *)isc_event_allocate(sock->manager->mctx, @@ -962,17 +1137,13 @@ static void dump_msg(struct msghdr *msg, isc_socket_t *sock) { unsigned int i; - printf("MSGHDR %p, Socket #: %d\n", msg, sock->fd); + printf("MSGHDR %p, Socket #: %u\n", msg, sock->fd); printf("\tname %p, namelen %d\n", msg->msg_name, msg->msg_namelen); printf("\tiov %p, iovlen %d\n", msg->msg_iov, msg->msg_iovlen); for (i = 0 ; i < (unsigned int)msg->msg_iovlen ; i++) printf("\t\t%d\tbase %p, len %d\n", i, msg->msg_iov[i].buf, msg->msg_iov[i].len); -#ifdef ISC_NET_BSD44MSGHDR - printf("\tcontrol %p, controllen %d\n", msg->msg_control, - msg->msg_controllen); -#endif } #endif @@ -982,41 +1153,10 @@ dump_msg(struct msghdr *msg, isc_socket_t *sock) { #define DOIO_EOF 3 /* EOF, no event sent */ static int -doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { - int cc; - WSABUF iov[MAXSCATTERGATHER_RECV]; - size_t read_count; +completeio_recv(isc_socket_t *sock, isc_socketevent_t *dev, int cc, + int recv_errno) { size_t actual_count; - struct msghdr msghdr; isc_buffer_t *buffer; - int recv_errno = 0; -#if USE_CMSG - char cmsg[CMSG_BUF_SIZE]; -#else - char *cmsg = NULL; -#endif - char strbuf[ISC_STRERRORSIZE]; - - build_msghdr_recv(sock, dev, &msghdr, cmsg, iov, &read_count); - -#if defined(ISC_SOCKET_DEBUG) - dump_msg(&msghdr,sock); -#endif - - cc = internal_recvmsg(sock->fd, &msghdr, 0, &recv_errno); - - if (cc < 0) { - if (SOFT_ERROR(recv_errno)) - return (DOIO_SOFT); - - if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { - isc__strerror(recv_errno, strbuf, sizeof(strbuf)); - socket_log(sock, NULL, IOEVENT, - isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_DOIORECV, - "doio_recv: recvmsg(%d) %d bytes, err %d/%s", - sock->fd, cc, recv_errno, strbuf); - } #define SOFT_OR_HARD(_system, _isc) \ if (recv_errno == _system) { \ @@ -1026,19 +1166,30 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { } \ return (DOIO_SOFT); \ } + #define ALWAYS_HARD(_system, _isc) \ if (recv_errno == _system) { \ dev->result = _isc; \ return (DOIO_HARD); \ } + if (recv_errno != 0) { + + if (SOFT_ERROR(recv_errno)) + return (DOIO_SOFT); + SOFT_OR_HARD(WSAECONNREFUSED, ISC_R_CONNREFUSED); SOFT_OR_HARD(WSAENETUNREACH, ISC_R_NETUNREACH); SOFT_OR_HARD(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH); SOFT_OR_HARD(WSAECONNRESET, ISC_R_CONNECTIONRESET); SOFT_OR_HARD(WSAENETRESET, ISC_R_CONNECTIONRESET); + SOFT_OR_HARD(WSAECONNABORTED, ISC_R_CONNECTIONRESET); SOFT_OR_HARD(WSAEDISCON, ISC_R_CONNECTIONRESET); SOFT_OR_HARD(WSAENETDOWN, ISC_R_NETDOWN); + ALWAYS_HARD(ERROR_OPERATION_ABORTED, ISC_R_CONNECTIONRESET); + ALWAYS_HARD(ERROR_PORT_UNREACHABLE, ISC_R_HOSTUNREACH); + ALWAYS_HARD(ERROR_HOST_UNREACHABLE, ISC_R_HOSTUNREACH); + ALWAYS_HARD(ERROR_NETWORK_UNREACHABLE, ISC_R_NETUNREACH); ALWAYS_HARD(WSAENOBUFS, ISC_R_NORESOURCES); #undef SOFT_OR_HARD @@ -1057,7 +1208,7 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { return (DOIO_EOF); if (sock->type == isc_sockettype_udp) { - dev->address.length = msghdr.msg_namelen; + dev->address.length = sock->messagehdr->msg_namelen; if (isc_sockaddr_getport(&dev->address) == 0) { if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { socket_log(sock, &dev->address, IOEVENT, @@ -1085,13 +1236,6 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { } #endif - /* - * If there are control messages attached, run through them and pull - * out the interesting bits. - */ - if (sock->type == isc_sockettype_udp) - process_cmsg(sock, &msghdr, dev); - /* * update the buffers (if any) and the i/o count */ @@ -1119,7 +1263,7 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { * If we read less than we expected, update counters, * and let the upper layer poke the descriptor. */ - if (((size_t)cc != read_count) && (dev->n < dev->minimum)) + if (((size_t)cc != sock->totalBytes) && (dev->n < dev->minimum)) return (DOIO_SOFT); /* @@ -1128,7 +1272,54 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { dev->result = ISC_R_SUCCESS; return (DOIO_SUCCESS); } +static int +startio_recv(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes, + BOOL bwait, int *recv_errno) { + char *cmsg = NULL; + char strbuf[ISC_STRERRORSIZE]; + IoCompletionInfo *lpo; + int status; + if (!bwait) { + lpo = (IoCompletionInfo *) HeapAlloc(hHeapHandle, + HEAP_ZERO_MEMORY, sizeof(IoCompletionInfo)); + lpo->request_type = SOCKET_RECV; + lpo->dev = dev; + } else { /* Wait for recv to complete */ + lpo = NULL; + } + sock->references++; + + build_msghdr_recv(sock, dev, sock->messagehdr, cmsg, sock->iov, + &(sock->totalBytes)); + +#if defined(ISC_SOCKET_DEBUG) + dump_msg(sock->messagehdr, sock); +#endif + + *nbytes = internal_recvmsg(sock, lpo, 0, recv_errno); + + if (*nbytes < 0) { + if (SOFT_ERROR(*recv_errno)) + return (DOIO_SOFT); + + if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { + isc__strerror(*recv_errno, strbuf, sizeof(strbuf)); + socket_log(sock, NULL, IOEVENT, + isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_DOIORECV, + "startio_recv: recvmsg(%d) %d bytes, err %d/%s", + sock->fd, *nbytes, *recv_errno, strbuf); + } + status = completeio_recv(sock, dev, *nbytes, *recv_errno); + if(status != DOIO_SOFT) { + sock->references--; + } + return (status); + } + dev->result = ISC_R_SUCCESS; + return(DOIO_SOFT); +} /* * Returns: * DOIO_SUCCESS The operation succeeded. dev->result contains @@ -1143,32 +1334,12 @@ doio_recv(isc_socket_t *sock, isc_socketevent_t *dev) { * No other return values are possible. */ static int -doio_send(isc_socket_t *sock, isc_socketevent_t *dev) { - int cc; - WSABUF iov[MAXSCATTERGATHER_SEND]; - size_t write_count; - struct msghdr msghdr; +completeio_send(isc_socket_t *sock, isc_socketevent_t *dev, int cc, + int send_errno) { char addrbuf[ISC_SOCKADDR_FORMATSIZE]; -#if USE_CMSG - char cmsg[CMSG_BUF_SIZE]; -#else - char *cmsg = NULL; -#endif - int attempts = 0; - int send_errno = 0; char strbuf[ISC_STRERRORSIZE]; - build_msghdr_send(sock, dev, &msghdr, cmsg, iov, &write_count); - -resend: - cc = internal_sendmsg(sock->fd, &msghdr, 0, &send_errno); - - /* - * Check for error or block condition. - */ - if (cc < 0) { - if (send_errno == WSAEINTR && ++attempts < NRETRIES) - goto resend; + if(send_errno != 0) { if (SOFT_ERROR(send_errno)) return (DOIO_SOFT); @@ -1192,9 +1363,14 @@ resend: SOFT_OR_HARD(WSAECONNREFUSED, ISC_R_CONNREFUSED); SOFT_OR_HARD(WSAENOTCONN, ISC_R_CONNREFUSED); SOFT_OR_HARD(WSAECONNRESET, ISC_R_CONNECTIONRESET); + SOFT_OR_HARD(WSAECONNABORTED, ISC_R_CONNECTIONRESET); SOFT_OR_HARD(WSAENETRESET, ISC_R_CONNECTIONRESET); SOFT_OR_HARD(WSAEDISCON, ISC_R_CONNECTIONRESET); SOFT_OR_HARD(WSAENETDOWN, ISC_R_NETDOWN); + ALWAYS_HARD(ERROR_OPERATION_ABORTED, ISC_R_CONNECTIONRESET); + ALWAYS_HARD(ERROR_PORT_UNREACHABLE, ISC_R_HOSTUNREACH); + ALWAYS_HARD(ERROR_HOST_UNREACHABLE, ISC_R_HOSTUNREACH); + ALWAYS_HARD(ERROR_NETWORK_UNREACHABLE, ISC_R_NETUNREACH); ALWAYS_HARD(WSAEADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL); ALWAYS_HARD(WSAEHOSTUNREACH, ISC_R_HOSTUNREACH); ALWAYS_HARD(WSAEHOSTDOWN, ISC_R_HOSTUNREACH); @@ -1217,23 +1393,17 @@ resend: */ isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf)); isc__strerror(send_errno, strbuf, sizeof(strbuf)); - UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s", + UNEXPECTED_ERROR(__FILE__, __LINE__, "completeio_send: %s: %s", addrbuf, strbuf); dev->result = isc__errno2result(send_errno); - return (DOIO_HARD); + return (DOIO_HARD); } - if (cc == 0) - UNEXPECTED_ERROR(__FILE__, __LINE__, - "internal_send: send() %s 0", - isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, - ISC_MSG_RETURNED, "returned")); - /* * If we write less than we expected, update counters, poke. */ dev->n += cc; - if ((size_t)cc != write_count) + if ((size_t)cc != sock->totalBytes) return (DOIO_SOFT); /* @@ -1243,7 +1413,50 @@ resend: dev->result = ISC_R_SUCCESS; return (DOIO_SUCCESS); } +static int +startio_send(isc_socket_t *sock, isc_socketevent_t *dev, int *nbytes, + BOOL bwait, int *send_errno) { + char *cmsg = NULL; + char strbuf[ISC_STRERRORSIZE]; + IoCompletionInfo *lpo; + int status; + if (!bwait) { + lpo = (IoCompletionInfo *) HeapAlloc(hHeapHandle, + HEAP_ZERO_MEMORY, sizeof(IoCompletionInfo)); + lpo->request_type = SOCKET_SEND; + lpo->dev = dev; + } else { /* Wait for send to complete */ + lpo = NULL; + } + sock->references++; + + build_msghdr_send(sock, dev, sock->messagehdr, cmsg, sock->iov, + &(sock->totalBytes)); + + *nbytes = internal_sendmsg(sock, lpo, 0, send_errno); + + if (*nbytes < 0) { + if (SOFT_ERROR(*send_errno)) + return (DOIO_SOFT); + + if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) { + isc__strerror(*send_errno, strbuf, sizeof(strbuf)); + socket_log(sock, NULL, IOEVENT, + isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_INTERNALSEND, + "startio_send: internal_sendmsg(%d) %d bytes, err %d/%s", + sock->fd, *nbytes, *send_errno, strbuf); + } + status = completeio_send(sock, dev, *nbytes, *send_errno); + if(status != DOIO_SOFT) { + sock->references--; + } + return (status); + } + dev->result = ISC_R_SUCCESS; + return(DOIO_SOFT); +} /* * Kill. * @@ -1251,34 +1464,33 @@ resend: * references exist. */ static void -destroy(isc_socket_t **sockp) { +destroy_socket(isc_socket_t **sockp) { isc_socket_t *sock = *sockp; isc_socketmgr_t *manager = sock->manager; socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_DESTROYING, "destroying"); + ISC_MSG_DESTROYING, "destroying socket %d", sock->fd); INSIST(ISC_LIST_EMPTY(sock->accept_list)); INSIST(ISC_LIST_EMPTY(sock->recv_list)); INSIST(ISC_LIST_EMPTY(sock->send_list)); INSIST(sock->connect_ev == NULL); - REQUIRE(sock->fd >= 0 && sock->fd < FD_SETSIZE); + REQUIRE(sock->fd >= 0); LOCK(&manager->lock); /* - * No one has this socket open, so the watcher doesn't have to be - * poked, and the socket doesn't have to be locked. + * No one has this socket open and the socket doesn't have to be + * locked. The socket_close function makes sure that if needed + * the event_wait loop removes any associated event from the list + * of events being waited on. */ - manager->fds[sock->fd] = NULL; - manager->fdstate[sock->fd] = CLOSE_PENDING; - select_poke(manager, sock->fd, SELECT_POKE_CLOSE); + socket_close(sock); + ISC_LIST_UNLINK(manager->socklist, sock, link); -#ifdef ISC_PLATFORM_USETHREADS if (ISC_LIST_EMPTY(manager->socklist)) SIGNAL(&manager->shutdown_ok); -#endif /* ISC_PLATFORM_USETHREADS */ /* * XXX should reset manager->maxfd here @@ -1291,12 +1503,11 @@ destroy(isc_socket_t **sockp) { static isc_result_t allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type, - isc_socket_t **socketp) -{ + isc_socket_t **socketp) { isc_socket_t *sock; isc_result_t ret; - sock = isc_mem_get(manager->mctx, sizeof(*sock)); + sock = isc_mem_get(manager->mctx, sizeof *sock); if (sock == NULL) return (ISC_R_NOMEMORY); @@ -1308,10 +1519,13 @@ allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type, sock->manager = manager; sock->type = type; - sock->fd = -1; + sock->fd = INVALID_SOCKET; ISC_LINK_INIT(sock, link); + sock->messagehdr = isc_mem_get(manager->mctx, sizeof(struct msghdr)); + memset(sock->messagehdr, 0, sizeof(struct msghdr)); + /* * set up list of readers and writers to be initially empty */ @@ -1319,13 +1533,16 @@ allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type, ISC_LIST_INIT(sock->send_list); ISC_LIST_INIT(sock->accept_list); sock->connect_ev = NULL; - sock->pending_recv = 0; - sock->pending_send = 0; sock->pending_accept = 0; + sock->pending_close = 0; + sock->iocp = 0; sock->listener = 0; sock->connected = 0; sock->connecting = 0; sock->bound = 0; + sock->iEvent = 0; + sock->hEvent = NULL; + sock->wait_type = 0; /* * initialize the lock @@ -1359,13 +1576,12 @@ allocate_socket(isc_socketmgr_t *manager, isc_sockettype_t type, return (ret); } - /* * This event requires that the various lists be empty, that the reference * count be 1, and that the magic number is valid. The other socket bits, * like the lock, must be initialized as well. The fd associated must be - * marked as closed, by setting it to -1 on close, or this routine will - * also close the socket. + * marked as closed, by setting it to INVALID_SOCKET on close, or this + * routine will also close the socket. */ static void free_socket(isc_socket_t **socketp) { @@ -1374,8 +1590,6 @@ free_socket(isc_socket_t **socketp) { INSIST(sock->references == 0); INSIST(VALID_SOCKET(sock)); INSIST(!sock->connecting); - INSIST(!sock->pending_recv); - INSIST(!sock->pending_send); INSIST(!sock->pending_accept); INSIST(ISC_LIST_EMPTY(sock->recv_list)); INSIST(ISC_LIST_EMPTY(sock->send_list)); @@ -1384,6 +1598,8 @@ free_socket(isc_socket_t **socketp) { sock->magic = 0; + isc_mem_put(sock->manager->mctx, sock->messagehdr, sizeof(struct msghdr)); + DESTROYLOCK(&sock->lock); isc_mem_put(sock->manager->mctx, sock, sizeof(*sock)); @@ -1405,7 +1621,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, #if defined(USE_CMSG) || defined(SO_BSDCOMPAT) int on = 1; #endif - int socket_errno = 0; + int socket_errno; char strbuf[ISC_STRERRORSIZE]; REQUIRE(VALID_MANAGER(manager)); @@ -1419,24 +1635,17 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, switch (type) { case isc_sockettype_udp: sock->fd = socket(pf, SOCK_DGRAM, IPPROTO_UDP); + if (connection_reset_fix(sock->fd) != ISC_R_SUCCESS) { + free_socket(&sock); + return (ISC_R_UNEXPECTED); + } break; case isc_sockettype_tcp: sock->fd = socket(pf, SOCK_STREAM, IPPROTO_TCP); break; } - - if (sock->fd >= FD_SETSIZE) { - (void)closesocket(sock->fd); - isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, - ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, - isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_TOOMANYFDS, - "%s: too many open file descriptors", "socket"); - free_socket(&sock); - return (ISC_R_NORESOURCES); - } - if (sock->fd < 0) { + if (sock->fd == INVALID_SOCKET) { socket_errno = WSAGetLastError(); free_socket(&sock); @@ -1468,41 +1677,10 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, return (ISC_R_UNEXPECTED); } -#ifdef SO_BSDCOMPAT - if (setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT, - (void *)&on, sizeof(on)) < 0) { - socket_errno = WSAGetLastError(); - isc__strerror(socket_errno, strbuf, sizeof(strbuf)); - UNEXPECTED_ERROR(__FILE__, __LINE__, - "setsockopt(%d, SO_BSDCOMPAT) %s: %s", - sock->fd, - isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, - ISC_MSG_FAILED, "failed"), - strbuf); - /* Press on... */ - } -#endif #if defined(USE_CMSG) if (type == isc_sockettype_udp) { -#if defined(SO_TIMESTAMP) - if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, - (void *)&on, sizeof(on)) < 0 - && WSAGetLastError() != WSAENOPROTOOPT) { - isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf)); - UNEXPECTED_ERROR(__FILE__, __LINE__, - "setsockopt(%d, SO_TIMESTAMP) %s: %s", - sock->fd, - isc_msgcat_get(isc_msgcat, - ISC_MSGSET_GENERAL, - ISC_MSG_FAILED, - "failed"), - strbuf); - /* Press on... */ - } -#endif /* SO_TIMESTAMP */ - #if defined(ISC_PLATFORM_HAVEIPV6) #ifdef IPV6_RECVPKTINFO /* 2292bis */ @@ -1558,13 +1736,7 @@ isc_socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type, * there are no external references to it yet. */ - manager->fds[sock->fd] = sock; - manager->fdstate[sock->fd] = MANAGED; ISC_LIST_APPEND(manager->socklist, sock, link); - if (manager->maxfd < sock->fd) - manager->maxfd = sock->fd; - manager->minfd = min(manager->minfd, sock->fd); - UNLOCK(&manager->lock); socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, @@ -1609,118 +1781,11 @@ isc_socket_detach(isc_socket_t **socketp) { UNLOCK(&sock->lock); if (kill_socket) - destroy(&sock); + destroy_socket(&sock); *socketp = NULL; } -/* - * I/O is possible on a given socket. Schedule an event to this task that - * will call an internal function to do the I/O. This will charge the - * task with the I/O operation and let our select loop handler get back - * to doing something real as fast as possible. - * - * The socket and manager must be locked before calling this function. - */ -static void -dispatch_recv(isc_socket_t *sock) { - intev_t *iev; - isc_socketevent_t *ev; - - INSIST(!sock->pending_recv); - - ev = ISC_LIST_HEAD(sock->recv_list); - if (ev == NULL) - return; - - sock->pending_recv = 1; - iev = &sock->readable_ev; - - socket_log(sock, NULL, EVENT, NULL, 0, 0, - "dispatch_recv: event %p -> task %p", ev, ev->ev_sender); - - sock->references++; - iev->ev_sender = sock; - iev->ev_action = internal_recv; - iev->ev_arg = sock; - - isc_task_send(ev->ev_sender, (isc_event_t **)&iev); -} - -static void -dispatch_send(isc_socket_t *sock) { - intev_t *iev; - isc_socketevent_t *ev; - - INSIST(!sock->pending_send); - - ev = ISC_LIST_HEAD(sock->send_list); - if (ev == NULL) - return; - - sock->pending_send = 1; - iev = &sock->writable_ev; - - socket_log(sock, NULL, EVENT, NULL, 0, 0, - "dispatch_send: event %p -> task %p", ev, ev->ev_sender); - - sock->references++; - iev->ev_sender = sock; - iev->ev_action = internal_send; - iev->ev_arg = sock; - - isc_task_send(ev->ev_sender, (isc_event_t **)&iev); -} - -/* - * Dispatch an internal accept event. - */ -static void -dispatch_accept(isc_socket_t *sock) { - intev_t *iev; - isc_socket_newconnev_t *ev; - - INSIST(!sock->pending_accept); - - /* - * Are there any done events left, or were they all canceled - * before the manager got the socket lock? - */ - ev = ISC_LIST_HEAD(sock->accept_list); - if (ev == NULL) - return; - - sock->pending_accept = 1; - iev = &sock->readable_ev; - - sock->references++; /* keep socket around for this internal event */ - iev->ev_sender = sock; - iev->ev_action = internal_accept; - iev->ev_arg = sock; - - isc_task_send(ev->ev_sender, (isc_event_t **)&iev); -} - -static void -dispatch_connect(isc_socket_t *sock) { - intev_t *iev; - isc_socket_connev_t *ev; - - iev = &sock->writable_ev; - - ev = sock->connect_ev; - INSIST(ev != NULL); /* XXX */ - - INSIST(sock->connecting); - - sock->references++; /* keep socket around for this internal event */ - iev->ev_sender = sock; - iev->ev_action = internal_connect; - iev->ev_arg = sock; - - isc_task_send(ev->ev_sender, (isc_event_t **)&iev); -} - /* * Dequeue an item off the given socket's read queue, set the result code * in the done event to the one provided, and send it to the task it was @@ -1739,8 +1804,9 @@ send_recvdone_event(isc_socket_t *sock, isc_socketevent_t **dev) { (*dev)->ev_sender = sock; - if (ISC_LINK_LINKED(*dev, ev_link)) + if (ISC_LINK_LINKED(*dev, ev_link)) { ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link); + } if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) == ISC_SOCKEVENTATTR_ATTACHED) @@ -1763,8 +1829,9 @@ send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) { task = (*dev)->ev_sender; (*dev)->ev_sender = sock; - if (ISC_LINK_LINKED(*dev, ev_link)) + if (ISC_LINK_LINKED(*dev, ev_link)) { ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link); + } if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED) == ISC_SOCKEVENTATTR_ATTACHED) @@ -1785,20 +1852,15 @@ send_senddone_event(isc_socket_t *sock, isc_socketevent_t **dev) { * so just unlock and return. */ static void -internal_accept(isc_task_t *me, isc_event_t *ev) { - isc_socket_t *sock; +internal_accept(isc_socket_t *sock, int accept_errno) { isc_socketmgr_t *manager; isc_socket_newconnev_t *dev; isc_task_t *task; ISC_SOCKADDR_LEN_T addrlen; - int fd; + SOCKET fd; isc_result_t result = ISC_R_SUCCESS; - int accept_errno = 0; char strbuf[ISC_STRERRORSIZE]; - UNUSED(me); - - sock = ev->ev_sender; INSIST(VALID_SOCKET(sock)); LOCK(&sock->lock); @@ -1810,6 +1872,7 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { INSIST(VALID_MANAGER(manager)); INSIST(sock->listener); + INSIST(sock->iEvent); INSIST(sock->pending_accept == 1); sock->pending_accept = 0; @@ -1817,7 +1880,7 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { sock->references--; /* the internal event is done with this socket */ if (sock->references == 0) { UNLOCK(&sock->lock); - destroy(&sock); + destroy_socket(&sock); return; } @@ -1836,11 +1899,11 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { * EAGAIN or EINTR, simply poke the watcher to watch this socket * again. */ - addrlen = sizeof(dev->newsocket->address.type); + addrlen = sizeof dev->newsocket->address.type; memset(&dev->newsocket->address.type.sa, 0, addrlen); fd = accept(sock->fd, &dev->newsocket->address.type.sa, (void *)&addrlen); - if (fd < 0) { + if (fd == INVALID_SOCKET) { accept_errno = WSAGetLastError(); if (SOFT_ERROR(accept_errno) || accept_errno == WSAECONNRESET) { goto soft_error; @@ -1853,7 +1916,7 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { ISC_MSG_FAILED, "failed"), strbuf); - fd = -1; + fd = INVALID_SOCKET; result = ISC_R_UNEXPECTED; } } else { @@ -1877,19 +1940,10 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { sock->pf); (void)closesocket(fd); goto soft_error; - } else if (fd >= FD_SETSIZE) { - isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL, - ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR, - isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_TOOMANYFDS, - "%s: too many open file descriptors", - "accept"); - (void)closesocket(fd); - goto soft_error; } } - if (fd != -1) { + if (fd != INVALID_SOCKET) { dev->newsocket->address.length = addrlen; dev->newsocket->pf = sock->pf; } @@ -1899,24 +1953,18 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { */ ISC_LIST_UNLINK(sock->accept_list, dev, ev_link); - /* - * Poke watcher if there are more pending accepts. - */ - if (!ISC_LIST_EMPTY(sock->accept_list)) - select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); - UNLOCK(&sock->lock); - if (fd != -1 && (make_nonblock(fd) != ISC_R_SUCCESS)) { + if (fd != INVALID_SOCKET && (make_nonblock(fd) != ISC_R_SUCCESS)) { closesocket(fd); - fd = -1; + fd = INVALID_SOCKET; result = ISC_R_UNEXPECTED; } /* - * -1 means the new socket didn't happen. + * INVALID_SOCKET means the new socket didn't happen. */ - if (fd != -1) { + if (fd != INVALID_SOCKET) { LOCK(&manager->lock); ISC_LIST_APPEND(manager->socklist, dev->newsocket, link); @@ -1929,18 +1977,12 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { */ dev->address = dev->newsocket->address; - manager->fds[fd] = dev->newsocket; - manager->fdstate[fd] = MANAGED; - if (manager->maxfd < fd) - manager->maxfd = fd; - manager->minfd = min(manager->minfd, sock->fd); - socket_log(sock, &dev->newsocket->address, CREATION, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN, "accepted connection, new socket %p", dev->newsocket); - UNLOCK(&manager->lock); + UNLOCK(&manager->lock); } else { dev->newsocket->references--; free_socket(&dev->newsocket); @@ -1957,46 +1999,49 @@ internal_accept(isc_task_t *me, isc_event_t *ev) { return; soft_error: - select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT); UNLOCK(&sock->lock); return; } static void -internal_recv(isc_task_t *me, isc_event_t *ev) { - isc_socketevent_t *dev; - isc_socket_t *sock; +internal_recv(isc_socket_t *sock, isc_socketevent_t *dev, int nbytes, int recv_errno) { + isc_socketevent_t *ldev; + int io_state; + int cc; - INSIST(ev->ev_type == ISC_SOCKEVENT_INTR); - - sock = ev->ev_sender; INSIST(VALID_SOCKET(sock)); LOCK(&sock->lock); socket_log(sock, NULL, IOEVENT, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV, - "internal_recv: task %p got event %p", me, ev); - - INSIST(sock->pending_recv == 1); - sock->pending_recv = 0; + "internal_recv: task got socket event %p", dev); INSIST(sock->references > 0); sock->references--; /* the internal event is done with this socket */ if (sock->references == 0) { UNLOCK(&sock->lock); - destroy(&sock); + destroy_socket(&sock); return; } + /* If the event is no longer in the list we can just return */ + ldev = ISC_LIST_HEAD(sock->recv_list); + while (ldev != NULL && ldev != dev) { + ldev = ISC_LIST_NEXT(ldev, ev_link); + } + if (ldev == NULL) + goto done; + /* * Try to do as much I/O as possible on this socket. There are no * limits here, currently. */ - dev = ISC_LIST_HEAD(sock->recv_list); - while (dev != NULL) { - switch (doio_recv(sock, dev)) { + switch (completeio_recv(sock, dev, nbytes, recv_errno)) { case DOIO_SOFT: - goto poke; + cc = 0; + recv_errno = 0; + io_state = startio_recv(sock, dev, &cc, FALSE, &recv_errno); + goto done; case DOIO_EOF: /* @@ -2004,325 +2049,291 @@ internal_recv(isc_task_t *me, isc_event_t *ev) { * Run through the event queue and dispatch all * the events with an EOF result code. */ - do { - dev->result = ISC_R_EOF; - send_recvdone_event(sock, &dev); - dev = ISC_LIST_HEAD(sock->recv_list); - } while (dev != NULL); - goto poke; + dev->result = ISC_R_EOF; + send_recvdone_event(sock, &dev); + goto done; case DOIO_SUCCESS: case DOIO_HARD: send_recvdone_event(sock, &dev); break; } - - dev = ISC_LIST_HEAD(sock->recv_list); - } - - poke: - if (!ISC_LIST_EMPTY(sock->recv_list)) - select_poke(sock->manager, sock->fd, SELECT_POKE_READ); - + done: UNLOCK(&sock->lock); } static void -internal_send(isc_task_t *me, isc_event_t *ev) { - isc_socketevent_t *dev; - isc_socket_t *sock; - - INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); +internal_send(isc_socket_t *sock, isc_socketevent_t *dev, int nbytes, int send_errno) { + isc_socketevent_t *ldev; + int io_state; + int cc; /* * Find out what socket this is and lock it. */ - sock = (isc_socket_t *)ev->ev_sender; INSIST(VALID_SOCKET(sock)); LOCK(&sock->lock); socket_log(sock, NULL, IOEVENT, isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND, - "internal_send: task %p got event %p", me, ev); - - INSIST(sock->pending_send == 1); - sock->pending_send = 0; + "internal_send: task got socket event %p", dev); INSIST(sock->references > 0); sock->references--; /* the internal event is done with this socket */ if (sock->references == 0) { UNLOCK(&sock->lock); - destroy(&sock); + destroy_socket(&sock); return; } + /* If the event is no longer in the list we can just return */ + ldev = ISC_LIST_HEAD(sock->send_list); + while (ldev != NULL && ldev != dev) { + ldev = ISC_LIST_NEXT(ldev, ev_link); + } + if (ldev == NULL) + goto done; /* * Try to do as much I/O as possible on this socket. There are no * limits here, currently. */ - dev = ISC_LIST_HEAD(sock->send_list); - while (dev != NULL) { - switch (doio_send(sock, dev)) { - case DOIO_SOFT: - goto poke; + switch (completeio_send(sock, dev, nbytes, send_errno)) { + case DOIO_SOFT: + cc = 0; + send_errno = 0; + io_state = startio_send(sock, dev, &cc, FALSE, &send_errno); + goto done; - case DOIO_HARD: - case DOIO_SUCCESS: - send_senddone_event(sock, &dev); - break; - } - - dev = ISC_LIST_HEAD(sock->send_list); + case DOIO_HARD: + case DOIO_SUCCESS: + send_senddone_event(sock, &dev); + break; } - poke: - if (!ISC_LIST_EMPTY(sock->send_list)) - select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE); + done: UNLOCK(&sock->lock); } -static void -process_fds(isc_socketmgr_t *manager, int maxfd, int minfd, - fd_set *readfds, fd_set *writefds, fd_set *exceptfds) -{ - int i; - isc_socket_t *sock; - isc_boolean_t unlock_sock; - BOOL conn_check = FALSE; +static isc_threadresult_t WINAPI +SocketIoThread(LPVOID ThreadContext) { + isc_socketmgr_t *manager = ThreadContext; + BOOL bSuccess = FALSE; + DWORD nbytes; + DWORD tbytes; + DWORD tflags; + IoCompletionInfo *lpo = NULL; + isc_socket_t *sock = NULL; + int request; + isc_socketevent_t *dev = NULL; + int errval; + char strbuf[ISC_STRERRORSIZE]; + int errstatus; + + /* Set the thread priority high enough so I/O will + * preempt normal recv packet processing, but not + * higher than the timer sync thread. + */ + if (!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_ABOVE_NORMAL)) { + errval = GetLastError(); + isc__strerror(errval, strbuf, sizeof(strbuf)); + FATAL_ERROR(__FILE__, __LINE__, + isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, + ISC_MSG_FAILED, + "Can't set thread priority: %s"), + strbuf); + } - REQUIRE(maxfd <= FD_SETSIZE); /* - * Process read/writes on other fds here. Avoid locking - * and unlocking twice if both reads and writes are possible. + * Loop forever waiting on I/O Completions and then processing them */ - for (i = minfd ; i <= maxfd ; i++) { - if (manager->fdstate[i] == CLOSE_PENDING) { - manager->fdstate[i] = CLOSED; - FD_CLR(i, &manager->read_fds); - FD_CLR(i, &manager->write_fds); - FD_CLR(i, &manager->except_fds); + while(TRUE) { + bSuccess = GetQueuedCompletionStatus ( + manager->hIoCompletionPort, + &nbytes, + (LPDWORD) &sock, + (LPOVERLAPPED *)&lpo, + INFINITE + ); + if(lpo == NULL ) { + /* + * Received request to exit + */ + break; + } + errstatus = 0; + if(!bSuccess) { + /* + * I/O Failure + * Find out why + */ + WSAGetOverlappedResult(sock->fd, (LPWSAOVERLAPPED) &lpo, + &tbytes, FALSE, &tflags); + errstatus = WSAGetLastError(); + isc__strerror(errstatus, strbuf, sizeof(strbuf)); + dev = lpo->dev; + } - closesocket(i); + request = lpo->request_type; + dev = lpo->dev; + switch (request) { + case SOCKET_CANCEL: + break; + case SOCKET_RECV: + internal_recv(sock, dev, nbytes, errstatus); + break; + case SOCKET_SEND: + internal_send(sock, dev, nbytes, errstatus); + break; + default: + break; /* Unknown: Just ignore it */ + } + if (lpo != NULL) + HeapFree(hHeapHandle, 0, lpo); + } + /* + * Exit Completion Port Thread + */ + manager_log(manager, TRACE, + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_EXITING, "SocketIoThread exiting")); + return ((isc_threadresult_t)0); +} +/* + * This is the thread that will loop forever, waiting for an event to + * happen. + * + * When the wait returns something to do, find the signaled event + * and issue the request for the given socket + */ +static isc_threadresult_t WINAPI +event_wait(void *uap) { + isc_socketmgr_t *manager = uap; + int cc; + int event_errno; + char strbuf[ISC_STRERRORSIZE]; + isc_socket_t *wsock; + int iEvent; + int max_event; + sock_event_list *evlist; + WSANETWORKEVENTS NetworkEvents; + int err; + + evlist = &(manager->sockev_list); + + /* + * Get the Event Lock here. + */ + LOCK(&evlist->EventLock); + + while (!manager->bShutdown) { + do { + + max_event = evlist->max_event; + event_errno = 0; + + WSAResetEvent(evlist->aEventList[0]); + UNLOCK(&evlist->EventLock); + cc = WSAWaitForMultipleEvents(max_event, + evlist->aEventList,FALSE, WSA_INFINITE, + FALSE); + if (cc == WSA_WAIT_FAILED) { + event_errno = WSAGetLastError(); + if (!SOFT_ERROR(event_errno)) { + isc__strerror(event_errno, strbuf, + sizeof(strbuf)); + FATAL_ERROR(__FILE__, __LINE__, + "WSAWaitForMultipleEvents() %s: %s", + isc_msgcat_get(isc_msgcat, + ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, + "failed"), + strbuf); + } + } + + LOCK(&evlist->EventLock); + } while (cc < 0 && !manager->bShutdown + && manager->event_written == 0); + + + if (manager->bShutdown) + break; + + iEvent = cc - WSA_WAIT_EVENT_0; + + /* + * Add or delete events as requested + */ + if (manager->event_written > 0) + process_eventlist(evlist, manager); + /* + * Stopped to add and delete events on the list + */ + if(iEvent == 0) + continue; + + wsock = evlist->aSockList[iEvent]; + if (wsock == NULL) + continue; + + if (WSAEnumNetworkEvents( wsock->fd, 0, + &NetworkEvents) == SOCKET_ERROR) + err = WSAGetLastError(); + + if(NetworkEvents.lNetworkEvents == 0 ) { + WSAResetEvent(wsock->hEvent); continue; } - sock = manager->fds[i]; - unlock_sock = ISC_FALSE; - if (FD_ISSET(i, readfds)) { - if (sock == NULL) { - FD_CLR(i, &manager->read_fds); - goto check_write; - } - unlock_sock = ISC_TRUE; - LOCK(&sock->lock); - if (!SOCK_DEAD(sock)) { - if (sock->listener) - dispatch_accept(sock); - else - dispatch_recv(sock); - } - FD_CLR(i, &manager->read_fds); - } - check_write: - if (FD_ISSET(i, writefds)) { - if (sock == NULL) { - FD_CLR(i, &manager->write_fds); - continue; - } - if (!unlock_sock) { - unlock_sock = ISC_TRUE; - LOCK(&sock->lock); - } - if (!SOCK_DEAD(sock)) { - if (sock->connecting) { - dispatch_connect(sock); - conn_check = TRUE; - } - else - dispatch_send(sock); - } - FD_CLR(i, &manager->write_fds); - FD_CLR(i, &manager->except_fds); - } - if (FD_ISSET(i, exceptfds)) { - if (sock == NULL) { - FD_CLR(i, &manager->except_fds); - continue; - } - if (!unlock_sock) { - unlock_sock = ISC_TRUE; - LOCK(&sock->lock); - } - if (!SOCK_DEAD(sock)) { - if (sock->connecting) { - dispatch_connect(sock); - conn_check = TRUE; - } - } - FD_CLR(i, &manager->write_fds); - FD_CLR(i, &manager->except_fds); + if(NetworkEvents.lNetworkEvents & FD_CLOSE) { + WSAResetEvent(wsock->hEvent); + continue; } - if (unlock_sock) - UNLOCK(&sock->lock); - } -} - -#ifdef ISC_PLATFORM_USETHREADS -/* - * This is the thread that will loop forever, always in a select or poll - * call. - * - * When select returns something to do, track down what thread gets to do - * this I/O and post the event to it. - */ -static isc_threadresult_t WINAPI -watcher(void *uap) { - isc_socketmgr_t *manager = uap; - isc_boolean_t done; - int ctlfd; - int cc; - fd_set readfds; - fd_set writefds; - fd_set exceptfds; - int msg, fd; - int maxfd; - int minfd; - int watcher_errno = 0; - char strbuf[ISC_STRERRORSIZE]; - - /* Timeout on select in case it's necesasary */ - struct timeval tv; - tv.tv_sec = MAX_SELECT_SECONDS; - tv.tv_usec = MAX_SELECT_MILLISECONDS; - - /* - * Get the control fd here. This will never change. - */ - LOCK(&manager->lock); - ctlfd = manager->pipe_fds[0]; - - done = ISC_FALSE; - while (!done) { - do { - readfds = manager->read_fds; - writefds = manager->write_fds; - exceptfds = manager->except_fds; - maxfd = manager->maxfd; /* Pipe not included */ - minfd = manager->minfd; - watcher_errno = 0; - - UNLOCK(&manager->lock); - - if(maxfd > 0) { - cc = select(maxfd, &readfds, &writefds, - &exceptfds, &tv); - if (cc < 0 && bpipe_written <= 0) { - watcher_errno = WSAGetLastError(); - if (!SOFT_ERROR(watcher_errno) && - watcher_errno != WSAEINVAL) { - isc__strerror(watcher_errno, - strbuf, sizeof(strbuf)); - FATAL_ERROR(__FILE__, __LINE__, - "select() %s: %s", - isc_msgcat_get(isc_msgcat, - ISC_MSGSET_GENERAL, - ISC_MSG_FAILED, - "failed"), - strbuf); - } - } - - } else { - /* - * Nothing to select on yet, so sleep - * and check again for work - */ - Sleep(MAX_SELECT_SECONDS*1000 - + MAX_SELECT_MILLISECONDS); + if (wsock->references > 0 && wsock->pending_close == 0) { + if (wsock->listener == 1 && + wsock->pending_accept == 0) { + wsock->pending_accept = 1; + wsock->references++; + internal_accept(wsock, event_errno); + } + else { + wsock->references++; + internal_connect(wsock, event_errno); } - - LOCK(&manager->lock); - } while (cc < 0 && bpipe_written <= 0); - - - /* - * Process reads on internal, control fd. - */ - while (bpipe_written > 0) { - select_readmsg(manager, &fd, &msg); - InterlockedDecrement(&bpipe_written); - - manager_log(manager, IOEVENT, - isc_msgcat_get(isc_msgcat, - ISC_MSGSET_SOCKET, - ISC_MSG_WATCHERMSG, - "watcher got message %d"), - msg); - - /* - * Nothing to read? - */ - if (msg == SELECT_POKE_NOTHING) - break; - - /* - * Handle shutdown message. We really should - * jump out of this loop right away, but - * it doesn't matter if we have to do a little - * more work first. - */ - if (msg == SELECT_POKE_SHUTDOWN) { - done = ISC_TRUE; - - break; - } } - process_fds(manager, maxfd, minfd, &readfds, &writefds, &exceptfds); + if (wsock->hEvent != NULL) + WSAResetEvent(wsock->hEvent); } manager_log(manager, TRACE, isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, - ISC_MSG_EXITING, "watcher exiting")); + ISC_MSG_EXITING, "event_wait exiting")); - UNLOCK(&manager->lock); + UNLOCK(&evlist->EventLock); return ((isc_threadresult_t)0); } -#endif /* ISC_PLATFORM_USETHREADS */ - /* * Create a new socket manager. */ isc_result_t isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { isc_socketmgr_t *manager; -#ifdef ISC_PLATFORM_USETHREADS - char strbuf[ISC_STRERRORSIZE]; -#endif REQUIRE(managerp != NULL && *managerp == NULL); -#ifndef ISC_PLATFORM_USETHREADS - if (socketmgr != NULL) { - socketmgr->refs++; - *managerp = socketmgr; - return (ISC_R_SUCCESS); - } -#endif /* ISC_PLATFORM_USETHREADS */ - - manager = isc_mem_get(mctx, sizeof(*manager)); + manager = isc_mem_get(mctx, sizeof *manager); if (manager == NULL) return (ISC_R_NOMEMORY); manager->magic = SOCKET_MANAGER_MAGIC; manager->mctx = NULL; - memset(manager->fds, 0, sizeof(manager->fds)); ISC_LIST_INIT(manager->socklist); if (isc_mutex_init(&manager->lock) != ISC_R_SUCCESS) { - isc_mem_put(mctx, manager, sizeof(*manager)); + isc_mem_put(mctx, manager, sizeof *manager); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_mutex_init() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, @@ -2331,7 +2342,7 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { } if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) { DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); + isc_mem_put(mctx, manager, sizeof *manager); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_condition_init() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, @@ -2339,47 +2350,23 @@ isc_socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) { return (ISC_R_UNEXPECTED); } - /* - * Create the special fds that will be used to wake up the - * select/poll loop when something internal needs to be done. - */ - if (internal_pipe(manager->pipe_fds) != 0) { - DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); - isc__strerror(errno, strbuf, sizeof(strbuf)); - UNEXPECTED_ERROR(__FILE__, __LINE__, - "pipe() %s: %s", - isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, - ISC_MSG_FAILED, "failed"), - strbuf); + iocompletionport_init(manager); /* Create the Completion Ports */ - return (ISC_R_UNEXPECTED); - } - - - /* - * Set up initial state for the select loop - */ - FD_ZERO(&manager->read_fds); - FD_ZERO(&manager->write_fds); - FD_ZERO(&manager->except_fds); - manager->maxfd = 0; - manager->minfd = FD_SETSIZE; - memset(manager->fdstate, 0, sizeof(manager->fdstate)); + socket_event_minit(&manager->sockev_list); + manager->event_written = 0; + manager->bShutdown = ISC_FALSE; /* * Start up the select/poll thread. */ - if (isc_thread_create(watcher, manager, &manager->watcher) != + if (isc_thread_create(event_wait, manager, &manager->watcher) != ISC_R_SUCCESS) { DESTROYLOCK(&manager->lock); - isc_mem_put(mctx, manager, sizeof(*manager)); + isc_mem_put(mctx, manager, sizeof *manager); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_thread_create() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed")); - _close(manager->pipe_fds[0]); - _close(manager->pipe_fds[1]); return (ISC_R_UNEXPECTED); } isc_mem_attach(mctx, &manager->mctx); @@ -2403,17 +2390,8 @@ isc_socketmgr_destroy(isc_socketmgr_t **managerp) { manager = *managerp; REQUIRE(VALID_MANAGER(manager)); -#ifndef ISC_PLATFORM_USETHREADS - if (manager->refs > 1) { - manager->refs--; - *managerp = NULL; - return; - } -#endif /* ISC_PLATFORM_USETHREADS */ - LOCK(&manager->lock); -#ifdef ISC_PLATFORM_USETHREADS /* * Wait for all sockets to be destroyed. */ @@ -2424,56 +2402,49 @@ isc_socketmgr_destroy(isc_socketmgr_t **managerp) { "sockets exist")); WAIT(&manager->shutdown_ok, &manager->lock); } -#else /* ISC_PLATFORM_USETHREADS */ - /* - * Hope all sockets have been destroyed. - */ - if (!ISC_LIST_EMPTY(manager->socklist)) { - manager_log(manager, CREATION, - isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET, - ISC_MSG_SOCKETSREMAIN, - "sockets exist")); - INSIST(0); - } -#endif /* ISC_PLATFORM_USETHREADS */ UNLOCK(&manager->lock); /* - * Here, poke our select/poll thread. Do this by closing the write - * half of the pipe, which will send EOF to the read half. - * This is currently a no-op in the non-threaded case. + * Here, we need to had some wait code for the completion port + * thread. */ - select_poke(manager, 0, SELECT_POKE_SHUTDOWN); + signal_iocompletionport_exit(manager); + manager->bShutdown = ISC_TRUE; + WSASetEvent(manager->sockev_list.aEventList[0]); -#ifdef ISC_PLATFORM_USETHREADS /* * Wait for thread to exit. */ + if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS) UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_thread_join() %s", isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, ISC_MSG_FAILED, "failed")); -#endif /* ISC_PLATFORM_USETHREADS */ - + /* + * Now the I/O Completion Port Worker Threads + */ + for (i = 0; i < manager->maxIOCPThreads; i++) { + if (isc_thread_join((isc_thread_t) manager->hIOCPThreads[i], NULL) + != ISC_R_SUCCESS) + UNEXPECTED_ERROR(__FILE__, __LINE__, + "isc_thread_join() for Completion Port %s", + isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, "failed")); + } /* * Clean up. */ -#ifdef ISC_PLATFORM_USETHREADS - _close(manager->pipe_fds[0]); - _close(manager->pipe_fds[1]); - (void)isc_condition_destroy(&manager->shutdown_ok); -#endif /* ISC_PLATFORM_USETHREADS */ - for (i = 0 ; i < FD_SETSIZE ; i++) - if (manager->fdstate[i] == CLOSE_PENDING) - closesocket(i); + CloseHandle(manager->hIoCompletionPort); + + (void)isc_condition_destroy(&manager->shutdown_ok); DESTROYLOCK(&manager->lock); manager->magic = 0; mctx= manager->mctx; - isc_mem_put(mctx, manager, sizeof(*manager)); + isc_mem_put(mctx, manager, sizeof *manager); isc_mem_detach(&mctx); @@ -2482,26 +2453,18 @@ isc_socketmgr_destroy(isc_socketmgr_t **managerp) { static isc_result_t socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, - unsigned int flags) -{ + unsigned int flags) { int io_state; - isc_boolean_t have_lock = ISC_FALSE; + int cc = 0; isc_task_t *ntask = NULL; isc_result_t result = ISC_R_SUCCESS; + int recv_errno = 0; dev->ev_sender = task; - if (sock->type == isc_sockettype_udp) { - io_state = doio_recv(sock, dev); - } else { - LOCK(&sock->lock); - have_lock = ISC_TRUE; - - if (ISC_LIST_EMPTY(sock->recv_list)) - io_state = doio_recv(sock, dev); - else - io_state = DOIO_SOFT; - } + LOCK(&sock->lock); + iocompletionport_update(sock); + io_state = startio_recv(sock, dev, &cc, FALSE, &recv_errno); switch (io_state) { case DOIO_SOFT: @@ -2514,17 +2477,9 @@ socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, isc_task_attach(task, &ntask); dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; - if (!have_lock) { - LOCK(&sock->lock); - have_lock = ISC_TRUE; - } - /* - * Enqueue the request. If the socket was previously not being - * watched, poke the watcher to start paying attention to it. + * Enqueue the request. */ - if (ISC_LIST_EMPTY(sock->recv_list)) - select_poke(sock->manager, sock->fd, SELECT_POKE_READ); ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link); socket_log(sock, NULL, EVENT, NULL, 0, 0, @@ -2545,9 +2500,7 @@ socket_recv(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, send_recvdone_event(sock, &dev); break; } - - if (have_lock) - UNLOCK(&sock->lock); + UNLOCK(&sock->lock); return (result); } @@ -2662,6 +2615,8 @@ socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, unsigned int flags) { int io_state; + int send_errno = 0; + int cc = 0; isc_boolean_t have_lock = ISC_FALSE; isc_task_t *ntask = NULL; isc_result_t result = ISC_R_SUCCESS; @@ -2684,17 +2639,10 @@ socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, dev->pktinfo.ipi6_ifindex = 0; } - if (sock->type == isc_sockettype_udp) - io_state = doio_send(sock, dev); - else { - LOCK(&sock->lock); - have_lock = ISC_TRUE; - - if (ISC_LIST_EMPTY(sock->send_list)) - io_state = doio_send(sock, dev); - else - io_state = DOIO_SOFT; - } + LOCK(&sock->lock); + have_lock = ISC_TRUE; + iocompletionport_update(sock); + io_state = startio_send(sock, dev, &cc, FALSE, &send_errno); switch (io_state) { case DOIO_SOFT: @@ -2702,7 +2650,7 @@ socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, * We couldn't send all or part of the request right now, so * queue it unless ISC_SOCKFLAG_NORETRY is set. */ - if ((flags & ISC_SOCKFLAG_NORETRY) == 0) { +// if ((flags & ISC_SOCKFLAG_NORETRY) == 0) { isc_task_attach(task, &ntask); dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED; @@ -2712,12 +2660,8 @@ socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, } /* - * Enqueue the request. If the socket was previously not being - * watched, poke the watcher to start paying attention to it. + * Enqueue the request. */ - if (ISC_LIST_EMPTY(sock->send_list)) - select_poke(sock->manager, sock->fd, - SELECT_POKE_WRITE); ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link); socket_log(sock, NULL, EVENT, NULL, 0, 0, @@ -2727,12 +2671,10 @@ socket_send(isc_socket_t *sock, isc_socketevent_t *dev, isc_task_t *task, if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0) result = ISC_R_INPROGRESS; break; - } +// } - case DOIO_HARD: case DOIO_SUCCESS: - if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0) - send_senddone_event(sock, &dev); +// result = ISC_R_INPROGRESS; break; } @@ -2775,7 +2717,6 @@ isc_socket_sendto(isc_socket_t *sock, isc_region_t *region, if (dev == NULL) { return (ISC_R_NOMEMORY); } - dev->region = *region; return (socket_send(sock, dev, task, address, pktinfo, 0)); @@ -2851,7 +2792,7 @@ isc_socket_sendto2(isc_socket_t *sock, isc_region_t *region, isc_result_t isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr) { - int bind_errno = 0; + int bind_errno; char strbuf[ISC_STRERRORSIZE]; int on = 1; @@ -2864,7 +2805,7 @@ isc_socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr) { return (ISC_R_FAMILYMISMATCH); } if (setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on, - sizeof(on)) < 0) { + sizeof on) < 0) { UNEXPECTED_ERROR(__FILE__, __LINE__, "setsockopt(%d) %s", sock->fd, isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, @@ -2921,6 +2862,7 @@ isc_socket_filter(isc_socket_t *sock, const char *filter) { isc_result_t isc_socket_listen(isc_socket_t *sock, unsigned int backlog) { char strbuf[ISC_STRERRORSIZE]; + isc_result_t retstat; REQUIRE(VALID_SOCKET(sock)); @@ -2944,6 +2886,20 @@ isc_socket_listen(isc_socket_t *sock, unsigned int backlog) { sock->listener = 1; + /* Add the socket to the list of events to accept */ + retstat = socket_event_add(sock, FD_ACCEPT | FD_CLOSE); + if (retstat != ISC_R_SUCCESS) { + UNLOCK(&sock->lock); + if (retstat != ISC_R_NOSPACE) { + isc__strerror(WSAGetLastError(), strbuf, + sizeof(strbuf)); + UNEXPECTED_ERROR(__FILE__, __LINE__, + "isc_socket_listen: socket_event_add: %s", strbuf); + } + return (retstat); + } + + UNLOCK(&sock->lock); return (ISC_R_SUCCESS); } @@ -2960,7 +2916,6 @@ isc_socket_accept(isc_socket_t *sock, isc_task_t *ntask = NULL; isc_socket_t *nsock; isc_result_t ret; - isc_boolean_t do_poke = ISC_FALSE; REQUIRE(VALID_SOCKET(sock)); manager = sock->manager; @@ -3001,18 +2956,10 @@ isc_socket_accept(isc_socket_t *sock, dev->newsocket = nsock; /* - * Poke watcher here. We still have the socket locked, so there - * is no race condition. We will keep the lock for such a short - * bit of time waking it up now or later won't matter all that much. + * Enqueue the event */ - if (ISC_LIST_EMPTY(sock->accept_list)) - do_poke = ISC_TRUE; - ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link); - if (do_poke) - select_poke(manager, sock->fd, SELECT_POKE_ACCEPT); - UNLOCK(&sock->lock); return (ISC_R_SUCCESS); } @@ -3025,6 +2972,7 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, isc_task_t *ntask = NULL; isc_socketmgr_t *manager; int cc; + int retstat; int errval; char strbuf[ISC_STRERRORSIZE]; @@ -3077,7 +3025,6 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, ERROR_MATCH(WSAENOBUFS, ISC_R_NORESOURCES); ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); - ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET); #undef ERROR_MATCH } @@ -3123,14 +3070,21 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, dev->ev_sender = ntask; /* - * Poke watcher here. We still have the socket locked, so there - * is no race condition. We will keep the lock for such a short - * bit of time waking it up now or later won't matter all that much. + * Enqueue the request. */ - if (sock->connect_ev == NULL) - select_poke(manager, sock->fd, SELECT_POKE_CONNECT); - sock->connect_ev = dev; + /* Add the socket to the list of events to connect */ + retstat = socket_event_add(sock, FD_CONNECT | FD_CLOSE); + if (retstat != ISC_R_SUCCESS) { + UNLOCK(&sock->lock); + if (retstat != ISC_R_NOSPACE) { + isc__strerror(WSAGetLastError(), strbuf, + sizeof(strbuf)); + UNEXPECTED_ERROR(__FILE__, __LINE__, + "isc_socket_connect: socket_event_add: %s", strbuf); + } + return (retstat); + } UNLOCK(&sock->lock); return (ISC_R_SUCCESS); @@ -3140,22 +3094,17 @@ isc_socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr, * Called when a socket with a pending connect() finishes. */ static void -internal_connect(isc_task_t *me, isc_event_t *ev) { - isc_socket_t *sock; +internal_connect(isc_socket_t *sock, int connect_errno) { isc_socket_connev_t *dev; isc_task_t *task; int cc; ISC_SOCKADDR_LEN_T optlen; - int connect_errno = 0; char strbuf[ISC_STRERRORSIZE]; - UNUSED(me); - INSIST(ev->ev_type == ISC_SOCKEVENT_INTW); - - sock = ev->ev_sender; INSIST(VALID_SOCKET(sock)); LOCK(&sock->lock); + WSAResetEvent(sock->hEvent); /* * When the internal event was sent the reference count was bumped @@ -3165,7 +3114,7 @@ internal_connect(isc_task_t *me, isc_event_t *ev) { sock->references--; if (sock->references == 0) { UNLOCK(&sock->lock); - destroy(&sock); + destroy_socket(&sock); return; } @@ -3189,16 +3138,16 @@ internal_connect(isc_task_t *me, isc_event_t *ev) { if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void *)&cc, (void *)&optlen) < 0) connect_errno = WSAGetLastError(); + else + connect_errno = 0; if (connect_errno != 0) { /* - * If the error is WSAEAGAIN, just re-select on this + * If the error is EAGAIN, just try again on this * fd and pretend nothing strange happened. */ if (SOFT_ERROR(connect_errno) || connect_errno == WSAEINPROGRESS) { sock->connecting = 1; - select_poke(sock->manager, sock->fd, - SELECT_POKE_CONNECT); UNLOCK(&sock->lock); return; @@ -3220,7 +3169,6 @@ internal_connect(isc_task_t *me, isc_event_t *ev) { ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH); ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED); ERROR_MATCH(WSAETIMEDOUT, ISC_R_TIMEDOUT); - ERROR_MATCH(WSAECONNRESET, ISC_R_CONNECTIONRESET); #undef ERROR_MATCH default: dev->result = ISC_R_UNEXPECTED; @@ -3283,7 +3231,7 @@ isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) { ret = ISC_R_SUCCESS; - len = sizeof(addressp->type); + len = sizeof addressp->type; if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) { isc__strerror(WSAGetLastError(), strbuf, sizeof(strbuf)); UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s", @@ -3305,6 +3253,7 @@ isc_socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp) { */ void isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) { +// IoCompletionInfo *CancelRequest; REQUIRE(VALID_SOCKET(sock)); @@ -3338,11 +3287,18 @@ isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) { while (dev != NULL) { current_task = dev->ev_sender; next = ISC_LIST_NEXT(dev, ev_link); - if ((task == NULL) || (task == current_task)) { dev->result = ISC_R_CANCELED; send_recvdone_event(sock, &dev); } +/* if (sock->iocp == 1) { + CancelRequest = (IoCompletionInfo *) HeapAlloc(hHeapHandle, + HEAP_ZERO_MEMORY, sizeof(IoCompletionInfo)); + CancelRequest->request_type = SOCKET_CANCEL; + CancelRequest->dev = dev; + PostQueuedCompletionStatus(hIoCompletionPort, 0, + (DWORD) sock, &CancelRequest->overlapped); + } */ dev = next; } } @@ -3358,11 +3314,18 @@ isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) { while (dev != NULL) { current_task = dev->ev_sender; next = ISC_LIST_NEXT(dev, ev_link); - if ((task == NULL) || (task == current_task)) { dev->result = ISC_R_CANCELED; send_senddone_event(sock, &dev); } +/* if (sock->iocp == 1) { + CancelRequest = (IoCompletionInfo *) HeapAlloc(hHeapHandle, + HEAP_ZERO_MEMORY, sizeof(IoCompletionInfo)); + CancelRequest->request_type = SOCKET_CANCEL; + CancelRequest->dev = dev; + PostQueuedCompletionStatus(hIoCompletionPort, 0, + (DWORD) sock, &CancelRequest->overlapped); + } */ dev = next; } } @@ -3374,6 +3337,8 @@ isc_socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how) { isc_task_t *current_task; dev = ISC_LIST_HEAD(sock->accept_list); + socket_event_delete(sock); + while (dev != NULL) { current_task = dev->ev_sender; next = ISC_LIST_NEXT(dev, ev_link); @@ -3440,3 +3405,22 @@ isc_socket_isbound(isc_socket_t *sock) { return (val); } + +void +isc_socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes) { +#if defined(IPV6_V6ONLY) + int onoff = yes ? 1 : 0; +#else + UNUSED(yes); + UNUSED(sock); +#endif + + REQUIRE(VALID_SOCKET(sock)); + +#ifdef IPV6_V6ONLY + if (sock->pf == AF_INET6) { + (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY, + (void *)&onoff, sizeof(onoff)); + } +#endif +}