1// Copyright 2017 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package poll
6
7import (
8	"errors"
9	"internal/race"
10	"internal/syscall/windows"
11	"io"
12	"sync"
13	"syscall"
14	"unicode/utf16"
15	"unicode/utf8"
16	"unsafe"
17)
18
var (
	// initErr holds the error returned by WSAStartup in InitWSA, if any.
	// FD.Init returns it without initializing the descriptor.
	initErr error
	// ioSync is the address handed to the race detector to order
	// socket writes (race.ReleaseMerge) before socket reads (race.Acquire).
	ioSync  uint64
)
23
24// This package uses the SetFileCompletionNotificationModes Windows
25// API to skip calling GetQueuedCompletionStatus if an IO operation
26// completes synchronously. There is a known bug where
27// SetFileCompletionNotificationModes crashes on some systems (see
28// https://support.microsoft.com/kb/2568167 for details).
29
// useSetFileCompletionNotificationModes reports whether
// SetFileCompletionNotificationModes is present and safe to use.
// It is set by checkSetFileCompletionNotificationModes.
var useSetFileCompletionNotificationModes bool
31
32// checkSetFileCompletionNotificationModes verifies that
33// SetFileCompletionNotificationModes Windows API is present
34// on the system and is safe to use.
35// See https://support.microsoft.com/kb/2568167 for details.
36func checkSetFileCompletionNotificationModes() {
37	err := syscall.LoadSetFileCompletionNotificationModes()
38	if err != nil {
39		return
40	}
41	protos := [2]int32{syscall.IPPROTO_TCP, 0}
42	var buf [32]syscall.WSAProtocolInfo
43	len := uint32(unsafe.Sizeof(buf))
44	n, err := syscall.WSAEnumProtocols(&protos[0], &buf[0], &len)
45	if err != nil {
46		return
47	}
48	for i := int32(0); i < n; i++ {
49		if buf[i].ServiceFlags1&syscall.XP1_IFS_HANDLES == 0 {
50			return
51		}
52	}
53	useSetFileCompletionNotificationModes = true
54}
55
56// InitWSA initiates the use of the Winsock DLL by the current process.
57// It is called from the net package at init time to avoid
58// loading ws2_32.dll when net is not used.
59var InitWSA = sync.OnceFunc(func() {
60	var d syscall.WSAData
61	e := syscall.WSAStartup(uint32(0x202), &d)
62	if e != nil {
63		initErr = e
64	}
65	checkSetFileCompletionNotificationModes()
66})
67
// operation contains a superset of the data necessary to perform all async IO.
// Each FD owns two of them: FD.rop for reads and FD.wop for writes.
type operation struct {
	// Used by IOCP interface, it must be the first field
	// of the struct, as our code relies on it.
	o syscall.Overlapped

	// fields used by runtime.netpoll
	runtimeCtx uintptr
	mode       int32 // 'r' or 'w', assigned by FD.Init

	// fields used only by net package
	fd     *FD                     // owning descriptor, assigned by FD.Init
	buf    syscall.WSABuf          // single buffer, filled in by InitBuf
	msg    windows.WSAMsg          // message descriptor, filled in by InitMsg
	sa     syscall.Sockaddr        // address passed to WSASendto and ConnectExFunc
	rsa    *syscall.RawSockaddrAny // receives the source address; allocated on first use
	rsan   int32                   // size in bytes of the address buffer handed to the syscall
	handle syscall.Handle          // socket handed to AcceptFunc
	flags  uint32                  // in/out flags for WSARecv, WSARecvFrom and WSAGetOverlappedResult
	qty    uint32                  // number of bytes transferred by the operation
	bufs   []syscall.WSABuf        // buffer list, filled in by InitBufs
}
90
91func (o *operation) InitBuf(buf []byte) {
92	o.buf.Len = uint32(len(buf))
93	o.buf.Buf = nil
94	if len(buf) != 0 {
95		o.buf.Buf = &buf[0]
96	}
97}
98
99func (o *operation) InitBufs(buf *[][]byte) {
100	if o.bufs == nil {
101		o.bufs = make([]syscall.WSABuf, 0, len(*buf))
102	} else {
103		o.bufs = o.bufs[:0]
104	}
105	for _, b := range *buf {
106		if len(b) == 0 {
107			o.bufs = append(o.bufs, syscall.WSABuf{})
108			continue
109		}
110		for len(b) > maxRW {
111			o.bufs = append(o.bufs, syscall.WSABuf{Len: maxRW, Buf: &b[0]})
112			b = b[maxRW:]
113		}
114		if len(b) > 0 {
115			o.bufs = append(o.bufs, syscall.WSABuf{Len: uint32(len(b)), Buf: &b[0]})
116		}
117	}
118}
119
120// ClearBufs clears all pointers to Buffers parameter captured
121// by InitBufs, so it can be released by garbage collector.
122func (o *operation) ClearBufs() {
123	for i := range o.bufs {
124		o.bufs[i].Buf = nil
125	}
126	o.bufs = o.bufs[:0]
127}
128
129func (o *operation) InitMsg(p []byte, oob []byte) {
130	o.InitBuf(p)
131	o.msg.Buffers = &o.buf
132	o.msg.BufferCount = 1
133
134	o.msg.Name = nil
135	o.msg.Namelen = 0
136
137	o.msg.Flags = 0
138	o.msg.Control.Len = uint32(len(oob))
139	o.msg.Control.Buf = nil
140	if len(oob) != 0 {
141		o.msg.Control.Buf = &oob[0]
142	}
143}
144
// execIO executes a single IO operation o. It tells the runtime poller
// that IO is about to start, calls submit to start it, and then either
// returns right away (the IO finished synchronously and the descriptor
// skips completion notifications for that case) or waits for the
// completion. If the wait is interrupted by a close or a deadline, the
// pending IO is canceled with CancelIoEx and execIO waits for the
// cancellation to finish before reporting the outcome.
//
// It returns the number of bytes transferred (o.qty) and an error.
// For ERROR_MORE_DATA and WSAEMSGSIZE the byte count is returned
// together with the error.
func execIO(o *operation, submit func(o *operation) error) (int, error) {
	// A zero runtimeCtx means the descriptor was never registered with the poller.
	if o.fd.pd.runtimeCtx == 0 {
		return 0, errors.New("internal error: polling on unsupported descriptor type")
	}

	fd := o.fd
	// Notify runtime netpoll about starting IO.
	err := fd.pd.prepare(int(o.mode), fd.isFile)
	if err != nil {
		return 0, err
	}
	// Start IO.
	err = submit(o)
	switch err {
	case nil:
		// IO completed immediately
		if o.fd.skipSyncNotif {
			// No completion message will follow, so return immediately.
			return int(o.qty), nil
		}
		// Need to get our completion message anyway.
	case syscall.ERROR_IO_PENDING:
		// IO started, and we have to wait for its completion.
		err = nil
	default:
		return 0, err
	}
	// Wait for our request to complete.
	err = fd.pd.wait(int(o.mode), fd.isFile)
	if err == nil {
		// The wait succeeded. Extract our IO results and return.
		err = windows.WSAGetOverlappedResult(fd.Sysfd, &o.o, &o.qty, false, &o.flags)
		if err != nil {
			// More data available. Return back the size of received data.
			if err == syscall.ERROR_MORE_DATA || err == windows.WSAEMSGSIZE {
				return int(o.qty), err
			}
			return 0, err
		}
		return int(o.qty), nil
	}
	// IO is interrupted by "close" or "timeout"
	netpollErr := err
	switch netpollErr {
	case ErrNetClosing, ErrFileClosing, ErrDeadlineExceeded:
		// will deal with those.
	default:
		panic("unexpected runtime.netpoll error: " + netpollErr.Error())
	}
	// Cancel our request.
	err = syscall.CancelIoEx(fd.Sysfd, &o.o)
	// Assuming ERROR_NOT_FOUND is returned, if IO is completed.
	if err != nil && err != syscall.ERROR_NOT_FOUND {
		// TODO(brainman): maybe do something else, but panic.
		panic(err)
	}
	// Wait for cancellation to complete.
	fd.pd.waitCanceled(int(o.mode))
	err = windows.WSAGetOverlappedResult(fd.Sysfd, &o.o, &o.qty, false, &o.flags)
	if err != nil {
		if err == syscall.ERROR_OPERATION_ABORTED { // IO Canceled
			// Report the reason for the cancellation (close or deadline)
			// rather than the generic "aborted" error.
			err = netpollErr
		}
		return 0, err
	}
	// We issued a cancellation request. But, it seems, IO operation succeeded
	// before the cancellation request run. We need to treat the IO operation as
	// succeeded (the bytes are actually sent/recv from network).
	return int(o.qty), nil
}
219
// FD is a file descriptor. The net and os packages embed this type in
// a larger type representing a network connection or OS file.
type FD struct {
	// Lock sysfd and serialize access to Read and Write methods.
	fdmu fdMutex

	// System file descriptor. Immutable until Close.
	Sysfd syscall.Handle

	// Read operation.
	rop operation
	// Write operation.
	wop operation

	// I/O poller.
	pd pollDesc

	// Used to implement pread/pwrite.
	// Also held by Read, Write and Seek on non-network descriptors.
	l sync.Mutex

	// For console I/O.
	lastbits       []byte   // first few bytes of the last incomplete rune in last write
	readuint16     []uint16 // buffer to hold uint16s obtained with ReadConsole
	readbyte       []byte   // buffer to hold decoding of readuint16 from utf16 to utf8
	readbyteOffset int      // readbyte[readbyteOffset:] is yet to be consumed with file.Read

	// Semaphore signaled when file is closed.
	csema uint32

	// Set by Init when FILE_SKIP_COMPLETION_PORT_ON_SUCCESS was enabled
	// on the handle. execIO then returns without waiting for a completion
	// message when an operation finishes synchronously.
	skipSyncNotif bool

	// Whether this is a streaming descriptor, as opposed to a
	// packet-based descriptor like a UDP socket.
	IsStream bool

	// Whether a zero byte read indicates EOF. This is false for a
	// message based socket connection.
	ZeroReadIsEOF bool

	// Whether this is a file rather than a network socket.
	// Init sets it to true for every kind other than kindNet.
	isFile bool

	// The kind of this file.
	kind fileKind
}
265
// fileKind describes the kind of file.
// FD.Init derives it from the network name it is given.
type fileKind byte

const (
	kindNet     fileKind = iota // network socket ("tcp", "udp", "ip", "unix" families)
	kindFile                    // "file" or "dir"
	kindConsole                 // "console"
	kindPipe                    // "pipe"
)
275
// logInitFD is set by tests to enable file descriptor initialization logging.
// When non-nil, FD.Init calls it with the network name, the FD and the
// error (possibly nil) from registering the FD with the poller.
var logInitFD func(net string, fd *FD, err error)
278
// Init initializes the FD. The Sysfd field should already be set.
// This can be called multiple times on a single FD.
// The net argument is a network name from the net package (e.g., "tcp"),
// or "file" or "console" or "dir" or "pipe". Any other name is rejected
// with an error.
// Set pollable to true if fd should be managed by runtime netpoll.
//
// On failure Init returns the error together with the name of the
// failing system call when there is one ("wsaioctl"), or "" otherwise.
func (fd *FD) Init(net string, pollable bool) (string, error) {
	// Winsock startup failed in InitWSA; nothing can be initialized.
	if initErr != nil {
		return "", initErr
	}

	switch net {
	case "file", "dir":
		fd.kind = kindFile
	case "console":
		fd.kind = kindConsole
	case "pipe":
		fd.kind = kindPipe
	case "tcp", "tcp4", "tcp6",
		"udp", "udp4", "udp6",
		"ip", "ip4", "ip6",
		"unix", "unixgram", "unixpacket":
		fd.kind = kindNet
	default:
		return "", errors.New("internal error: unknown network type " + net)
	}
	fd.isFile = fd.kind != kindNet

	var err error
	if pollable {
		// Only call init for a network socket.
		// This means that we don't add files to the runtime poller.
		// Adding files to the runtime poller can confuse matters
		// if the user is doing their own overlapped I/O.
		// See issue #21172.
		//
		// In general the code below avoids calling the execIO
		// function for non-network sockets. If some method does
		// somehow call execIO, then execIO, and therefore the
		// calling method, will return an error, because
		// fd.pd.runtimeCtx will be 0.
		err = fd.pd.init(fd)
	}
	// The test hook sees the poller registration result, including failures.
	if logInitFD != nil {
		logInitFD(net, fd, err)
	}
	if err != nil {
		return "", err
	}
	if pollable && useSetFileCompletionNotificationModes {
		// We do not use events, so we can skip them always.
		flags := uint8(syscall.FILE_SKIP_SET_EVENT_ON_HANDLE)
		switch net {
		case "tcp", "tcp4", "tcp6",
			"udp", "udp4", "udp6":
			flags |= syscall.FILE_SKIP_COMPLETION_PORT_ON_SUCCESS
		}
		// A failure here is not fatal: the descriptor simply keeps
		// receiving completion messages for synchronous completions.
		err := syscall.SetFileCompletionNotificationModes(fd.Sysfd, flags)
		if err == nil && flags&syscall.FILE_SKIP_COMPLETION_PORT_ON_SUCCESS != 0 {
			fd.skipSyncNotif = true
		}
	}
	// Disable SIO_UDP_CONNRESET behavior.
	// http://support.microsoft.com/kb/263823
	switch net {
	case "udp", "udp4", "udp6":
		ret := uint32(0)
		flag := uint32(0)
		size := uint32(unsafe.Sizeof(flag))
		err := syscall.WSAIoctl(fd.Sysfd, syscall.SIO_UDP_CONNRESET, (*byte)(unsafe.Pointer(&flag)), size, nil, 0, &ret, nil, 0)
		if err != nil {
			return "wsaioctl", err
		}
	}
	// Wire the read and write operations back to this FD and its poller context.
	fd.rop.mode = 'r'
	fd.wop.mode = 'w'
	fd.rop.fd = fd
	fd.wop.fd = fd
	fd.rop.runtimeCtx = fd.pd.runtimeCtx
	fd.wop.runtimeCtx = fd.pd.runtimeCtx
	return "", nil
}
360
361func (fd *FD) destroy() error {
362	if fd.Sysfd == syscall.InvalidHandle {
363		return syscall.EINVAL
364	}
365	// Poller may want to unregister fd in readiness notification mechanism,
366	// so this must be executed before fd.CloseFunc.
367	fd.pd.close()
368	var err error
369	switch fd.kind {
370	case kindNet:
371		// The net package uses the CloseFunc variable for testing.
372		err = CloseFunc(fd.Sysfd)
373	default:
374		err = syscall.CloseHandle(fd.Sysfd)
375	}
376	fd.Sysfd = syscall.InvalidHandle
377	runtime_Semrelease(&fd.csema)
378	return err
379}
380
381// Close closes the FD. The underlying file descriptor is closed by
382// the destroy method when there are no remaining references.
383func (fd *FD) Close() error {
384	if !fd.fdmu.increfAndClose() {
385		return errClosing(fd.isFile)
386	}
387	if fd.kind == kindPipe {
388		syscall.CancelIoEx(fd.Sysfd, nil)
389	}
390	// unblock pending reader and writer
391	fd.pd.evict()
392	err := fd.decref()
393	// Wait until the descriptor is closed. If this was the only
394	// reference, it is already closed.
395	runtime_Semacquire(&fd.csema)
396	return err
397}
398
// Windows ReadFile and WSARecv use a DWORD (uint32) parameter to pass the buffer length.
// This prevents us from reading blocks larger than 4GB.
// See golang.org/issue/26923.
//
// maxRW is the largest number of bytes handed to a single read or write call.
const maxRW = 1 << 30 // 1GB is large enough and keeps subsequent reads aligned
403
// Read implements io.Reader.
// At most maxRW bytes are read per call. Consoles are read through
// readConsole, other non-network descriptors through syscall.Read,
// and network sockets through an overlapped WSARecv.
func (fd *FD) Read(buf []byte) (int, error) {
	if err := fd.readLock(); err != nil {
		return 0, err
	}
	defer fd.readUnlock()

	if len(buf) > maxRW {
		buf = buf[:maxRW]
	}

	var n int
	var err error
	if fd.isFile {
		fd.l.Lock()
		defer fd.l.Unlock()
		switch fd.kind {
		case kindConsole:
			n, err = fd.readConsole(buf)
		default:
			n, err = syscall.Read(fd.Sysfd, buf)
			if fd.kind == kindPipe && err == syscall.ERROR_OPERATION_ABORTED {
				// Close uses CancelIoEx to interrupt concurrent I/O for pipes.
				// If the fd is a pipe and the Read was interrupted by CancelIoEx,
				// we assume it is interrupted by Close.
				err = ErrFileClosing
			}
		}
		// A failed read reports no bytes, whatever the syscall returned.
		if err != nil {
			n = 0
		}
	} else {
		o := &fd.rop
		o.InitBuf(buf)
		// NOTE(review): o.flags is passed to WSARecv as it was left by the
		// previous operation on fd.rop; it is not reset here.
		n, err = execIO(o, func(o *operation) error {
			return syscall.WSARecv(o.fd.Sysfd, &o.buf, 1, &o.qty, &o.flags, &o.o, nil)
		})
		if race.Enabled {
			race.Acquire(unsafe.Pointer(&ioSync))
		}
	}
	// A zero-length buffer is never subject to EOF translation.
	if len(buf) != 0 {
		err = fd.eofError(n, err)
	}
	return n, err
}
450
// ReadConsole is the function readConsole uses to read from the console.
// It is a variable so that tests can replace it.
var ReadConsole = syscall.ReadConsole // changed for testing
452
// readConsole reads utf16 characters from console File,
// encodes them into utf8 and stores them in buffer b.
// It returns the number of utf8 bytes read and an error, if any.
//
// Decoded bytes that do not fit in b stay in fd.readbyte and are
// handed out by later calls before the console is read again.
// Copying stops at a Ctrl-Z (0x1A) byte, which is never stored in b.
func (fd *FD) readConsole(b []byte) (int, error) {
	if len(b) == 0 {
		return 0, nil
	}

	if fd.readuint16 == nil {
		// Note: syscall.ReadConsole fails for very large buffers.
		// The limit is somewhere around (but not exactly) 16384.
		// Stay well below.
		fd.readuint16 = make([]uint16, 0, 10000)
		fd.readbyte = make([]byte, 0, 4*cap(fd.readuint16))
	}

	// Read from the console only once everything decoded earlier has been consumed.
	for fd.readbyteOffset >= len(fd.readbyte) {
		// Request at most len(b) UTF-16 units, bounded by the space left
		// after any half surrogate saved by the previous iteration.
		n := cap(fd.readuint16) - len(fd.readuint16)
		if n > len(b) {
			n = len(b)
		}
		var nw uint32
		// The double slicing yields a pointer to the first unused element
		// of fd.readuint16, i.e. the one at index len(fd.readuint16).
		err := ReadConsole(fd.Sysfd, &fd.readuint16[:len(fd.readuint16)+1][len(fd.readuint16)], uint32(n), &nw, nil)
		if err != nil {
			return 0, err
		}
		// uint16s is the saved half surrogate (if any) followed by the new units.
		uint16s := fd.readuint16[:len(fd.readuint16)+int(nw)]
		fd.readuint16 = fd.readuint16[:0]
		buf := fd.readbyte[:0]
		for i := 0; i < len(uint16s); i++ {
			r := rune(uint16s[i])
			if utf16.IsSurrogate(r) {
				if i+1 == len(uint16s) {
					if nw > 0 {
						// Save half surrogate pair for next time.
						fd.readuint16 = fd.readuint16[:1]
						fd.readuint16[0] = uint16(r)
						break
					}
					// Nothing new was read, so the pending half
					// is emitted as a replacement character.
					r = utf8.RuneError
				} else {
					r = utf16.DecodeRune(r, rune(uint16s[i+1]))
					// Skip the second unit only when the pair was valid.
					if r != utf8.RuneError {
						i++
					}
				}
			}
			buf = utf8.AppendRune(buf, r)
		}
		fd.readbyte = buf
		fd.readbyteOffset = 0
		// An empty console read ends the loop even if nothing was decoded.
		if nw == 0 {
			break
		}
	}

	// Copy out as much pending data as fits, stopping at Ctrl-Z.
	src := fd.readbyte[fd.readbyteOffset:]
	var i int
	for i = 0; i < len(src) && i < len(b); i++ {
		x := src[i]
		if x == 0x1A { // Ctrl-Z
			// A leading Ctrl-Z is consumed, so this call returns 0 bytes.
			// Otherwise it is left in place for the next call.
			if i == 0 {
				fd.readbyteOffset++
			}
			break
		}
		b[i] = x
	}
	fd.readbyteOffset += i
	return i, nil
}
524
525// Pread emulates the Unix pread system call.
526func (fd *FD) Pread(b []byte, off int64) (int, error) {
527	if fd.kind == kindPipe {
528		// Pread does not work with pipes
529		return 0, syscall.ESPIPE
530	}
531	// Call incref, not readLock, because since pread specifies the
532	// offset it is independent from other reads.
533	if err := fd.incref(); err != nil {
534		return 0, err
535	}
536	defer fd.decref()
537
538	if len(b) > maxRW {
539		b = b[:maxRW]
540	}
541
542	fd.l.Lock()
543	defer fd.l.Unlock()
544	curoffset, e := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent)
545	if e != nil {
546		return 0, e
547	}
548	defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
549	o := syscall.Overlapped{
550		OffsetHigh: uint32(off >> 32),
551		Offset:     uint32(off),
552	}
553	var done uint32
554	e = syscall.ReadFile(fd.Sysfd, b, &done, &o)
555	if e != nil {
556		done = 0
557		if e == syscall.ERROR_HANDLE_EOF {
558			e = io.EOF
559		}
560	}
561	if len(b) != 0 {
562		e = fd.eofError(int(done), e)
563	}
564	return int(done), e
565}
566
567// ReadFrom wraps the recvfrom network call.
568func (fd *FD) ReadFrom(buf []byte) (int, syscall.Sockaddr, error) {
569	if len(buf) == 0 {
570		return 0, nil, nil
571	}
572	if len(buf) > maxRW {
573		buf = buf[:maxRW]
574	}
575	if err := fd.readLock(); err != nil {
576		return 0, nil, err
577	}
578	defer fd.readUnlock()
579	o := &fd.rop
580	o.InitBuf(buf)
581	n, err := execIO(o, func(o *operation) error {
582		if o.rsa == nil {
583			o.rsa = new(syscall.RawSockaddrAny)
584		}
585		o.rsan = int32(unsafe.Sizeof(*o.rsa))
586		return syscall.WSARecvFrom(o.fd.Sysfd, &o.buf, 1, &o.qty, &o.flags, o.rsa, &o.rsan, &o.o, nil)
587	})
588	err = fd.eofError(n, err)
589	if err != nil {
590		return n, nil, err
591	}
592	sa, _ := o.rsa.Sockaddr()
593	return n, sa, nil
594}
595
596// ReadFromInet4 wraps the recvfrom network call for IPv4.
597func (fd *FD) ReadFromInet4(buf []byte, sa4 *syscall.SockaddrInet4) (int, error) {
598	if len(buf) == 0 {
599		return 0, nil
600	}
601	if len(buf) > maxRW {
602		buf = buf[:maxRW]
603	}
604	if err := fd.readLock(); err != nil {
605		return 0, err
606	}
607	defer fd.readUnlock()
608	o := &fd.rop
609	o.InitBuf(buf)
610	n, err := execIO(o, func(o *operation) error {
611		if o.rsa == nil {
612			o.rsa = new(syscall.RawSockaddrAny)
613		}
614		o.rsan = int32(unsafe.Sizeof(*o.rsa))
615		return syscall.WSARecvFrom(o.fd.Sysfd, &o.buf, 1, &o.qty, &o.flags, o.rsa, &o.rsan, &o.o, nil)
616	})
617	err = fd.eofError(n, err)
618	if err != nil {
619		return n, err
620	}
621	rawToSockaddrInet4(o.rsa, sa4)
622	return n, err
623}
624
625// ReadFromInet6 wraps the recvfrom network call for IPv6.
626func (fd *FD) ReadFromInet6(buf []byte, sa6 *syscall.SockaddrInet6) (int, error) {
627	if len(buf) == 0 {
628		return 0, nil
629	}
630	if len(buf) > maxRW {
631		buf = buf[:maxRW]
632	}
633	if err := fd.readLock(); err != nil {
634		return 0, err
635	}
636	defer fd.readUnlock()
637	o := &fd.rop
638	o.InitBuf(buf)
639	n, err := execIO(o, func(o *operation) error {
640		if o.rsa == nil {
641			o.rsa = new(syscall.RawSockaddrAny)
642		}
643		o.rsan = int32(unsafe.Sizeof(*o.rsa))
644		return syscall.WSARecvFrom(o.fd.Sysfd, &o.buf, 1, &o.qty, &o.flags, o.rsa, &o.rsan, &o.o, nil)
645	})
646	err = fd.eofError(n, err)
647	if err != nil {
648		return n, err
649	}
650	rawToSockaddrInet6(o.rsa, sa6)
651	return n, err
652}
653
// Write implements io.Writer.
// The data is written in chunks of at most maxRW bytes until buf is
// exhausted or a chunk fails. It returns the total number of bytes
// written so far and the first error encountered.
func (fd *FD) Write(buf []byte) (int, error) {
	if err := fd.writeLock(); err != nil {
		return 0, err
	}
	defer fd.writeUnlock()
	if fd.isFile {
		fd.l.Lock()
		defer fd.l.Unlock()
	}

	ntotal := 0
	for len(buf) > 0 {
		b := buf
		if len(b) > maxRW {
			b = b[:maxRW]
		}
		var n int
		var err error
		if fd.isFile {
			switch fd.kind {
			case kindConsole:
				n, err = fd.writeConsole(b)
			default:
				n, err = syscall.Write(fd.Sysfd, b)
				if fd.kind == kindPipe && err == syscall.ERROR_OPERATION_ABORTED {
					// Close uses CancelIoEx to interrupt concurrent I/O for pipes.
					// If the fd is a pipe and the Write was interrupted by CancelIoEx,
					// we assume it is interrupted by Close.
					err = ErrFileClosing
				}
			}
			// A failed chunk contributes nothing to the total.
			if err != nil {
				n = 0
			}
		} else {
			if race.Enabled {
				race.ReleaseMerge(unsafe.Pointer(&ioSync))
			}
			o := &fd.wop
			o.InitBuf(b)
			n, err = execIO(o, func(o *operation) error {
				return syscall.WSASend(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, &o.o, nil)
			})
		}
		ntotal += n
		if err != nil {
			return ntotal, err
		}
		buf = buf[n:]
	}
	return ntotal, nil
}
707
// writeConsole writes len(b) bytes to the console File.
// It returns the number of bytes written and an error, if any.
//
// The bytes are decoded as UTF-8 and written as UTF-16. A trailing
// incomplete rune is kept in fd.lastbits and prepended to the next
// write; its bytes are still counted in the returned length. On a
// WriteConsole failure the returned count is 0.
func (fd *FD) writeConsole(b []byte) (int, error) {
	n := len(b)
	runes := make([]rune, 0, 256)
	// Complete the rune left unfinished by the previous write, if any.
	if len(fd.lastbits) > 0 {
		b = append(fd.lastbits, b...)
		fd.lastbits = nil

	}
	// Decode while a whole rune is available. With UTFMax or more bytes
	// left, DecodeRune always makes progress, even on invalid input.
	for len(b) >= utf8.UTFMax || utf8.FullRune(b) {
		r, l := utf8.DecodeRune(b)
		runes = append(runes, r)
		b = b[l:]
	}
	// Whatever remains is the start of a rune: keep a private copy for next time.
	if len(b) > 0 {
		fd.lastbits = make([]byte, len(b))
		copy(fd.lastbits, b)
	}
	// syscall.WriteConsole seems to fail, if given large buffer.
	// So limit the buffer to 16000 characters. This number was
	// discovered by experimenting with syscall.WriteConsole.
	const maxWrite = 16000
	for len(runes) > 0 {
		m := len(runes)
		if m > maxWrite {
			m = maxWrite
		}
		chunk := runes[:m]
		runes = runes[m:]
		uint16s := utf16.Encode(chunk)
		// WriteConsole may accept only part of the chunk; retry with the rest.
		for len(uint16s) > 0 {
			var written uint32
			err := syscall.WriteConsole(fd.Sysfd, &uint16s[0], uint32(len(uint16s)), &written, nil)
			if err != nil {
				return 0, err
			}
			uint16s = uint16s[written:]
		}
	}
	return n, nil
}
750
751// Pwrite emulates the Unix pwrite system call.
752func (fd *FD) Pwrite(buf []byte, off int64) (int, error) {
753	if fd.kind == kindPipe {
754		// Pwrite does not work with pipes
755		return 0, syscall.ESPIPE
756	}
757	// Call incref, not writeLock, because since pwrite specifies the
758	// offset it is independent from other writes.
759	if err := fd.incref(); err != nil {
760		return 0, err
761	}
762	defer fd.decref()
763
764	fd.l.Lock()
765	defer fd.l.Unlock()
766	curoffset, e := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent)
767	if e != nil {
768		return 0, e
769	}
770	defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
771
772	ntotal := 0
773	for len(buf) > 0 {
774		b := buf
775		if len(b) > maxRW {
776			b = b[:maxRW]
777		}
778		var n uint32
779		o := syscall.Overlapped{
780			OffsetHigh: uint32(off >> 32),
781			Offset:     uint32(off),
782		}
783		e = syscall.WriteFile(fd.Sysfd, b, &n, &o)
784		ntotal += int(n)
785		if e != nil {
786			return ntotal, e
787		}
788		buf = buf[n:]
789		off += int64(n)
790	}
791	return ntotal, nil
792}
793
794// Writev emulates the Unix writev system call.
795func (fd *FD) Writev(buf *[][]byte) (int64, error) {
796	if len(*buf) == 0 {
797		return 0, nil
798	}
799	if err := fd.writeLock(); err != nil {
800		return 0, err
801	}
802	defer fd.writeUnlock()
803	if race.Enabled {
804		race.ReleaseMerge(unsafe.Pointer(&ioSync))
805	}
806	o := &fd.wop
807	o.InitBufs(buf)
808	n, err := execIO(o, func(o *operation) error {
809		return syscall.WSASend(o.fd.Sysfd, &o.bufs[0], uint32(len(o.bufs)), &o.qty, 0, &o.o, nil)
810	})
811	o.ClearBufs()
812	TestHookDidWritev(n)
813	consume(buf, int64(n))
814	return int64(n), err
815}
816
817// WriteTo wraps the sendto network call.
818func (fd *FD) WriteTo(buf []byte, sa syscall.Sockaddr) (int, error) {
819	if err := fd.writeLock(); err != nil {
820		return 0, err
821	}
822	defer fd.writeUnlock()
823
824	if len(buf) == 0 {
825		// handle zero-byte payload
826		o := &fd.wop
827		o.InitBuf(buf)
828		o.sa = sa
829		n, err := execIO(o, func(o *operation) error {
830			return syscall.WSASendto(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, o.sa, &o.o, nil)
831		})
832		return n, err
833	}
834
835	ntotal := 0
836	for len(buf) > 0 {
837		b := buf
838		if len(b) > maxRW {
839			b = b[:maxRW]
840		}
841		o := &fd.wop
842		o.InitBuf(b)
843		o.sa = sa
844		n, err := execIO(o, func(o *operation) error {
845			return syscall.WSASendto(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, o.sa, &o.o, nil)
846		})
847		ntotal += int(n)
848		if err != nil {
849			return ntotal, err
850		}
851		buf = buf[n:]
852	}
853	return ntotal, nil
854}
855
856// WriteToInet4 is WriteTo, specialized for syscall.SockaddrInet4.
857func (fd *FD) WriteToInet4(buf []byte, sa4 *syscall.SockaddrInet4) (int, error) {
858	if err := fd.writeLock(); err != nil {
859		return 0, err
860	}
861	defer fd.writeUnlock()
862
863	if len(buf) == 0 {
864		// handle zero-byte payload
865		o := &fd.wop
866		o.InitBuf(buf)
867		n, err := execIO(o, func(o *operation) error {
868			return windows.WSASendtoInet4(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, sa4, &o.o, nil)
869		})
870		return n, err
871	}
872
873	ntotal := 0
874	for len(buf) > 0 {
875		b := buf
876		if len(b) > maxRW {
877			b = b[:maxRW]
878		}
879		o := &fd.wop
880		o.InitBuf(b)
881		n, err := execIO(o, func(o *operation) error {
882			return windows.WSASendtoInet4(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, sa4, &o.o, nil)
883		})
884		ntotal += int(n)
885		if err != nil {
886			return ntotal, err
887		}
888		buf = buf[n:]
889	}
890	return ntotal, nil
891}
892
893// WriteToInet6 is WriteTo, specialized for syscall.SockaddrInet6.
894func (fd *FD) WriteToInet6(buf []byte, sa6 *syscall.SockaddrInet6) (int, error) {
895	if err := fd.writeLock(); err != nil {
896		return 0, err
897	}
898	defer fd.writeUnlock()
899
900	if len(buf) == 0 {
901		// handle zero-byte payload
902		o := &fd.wop
903		o.InitBuf(buf)
904		n, err := execIO(o, func(o *operation) error {
905			return windows.WSASendtoInet6(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, sa6, &o.o, nil)
906		})
907		return n, err
908	}
909
910	ntotal := 0
911	for len(buf) > 0 {
912		b := buf
913		if len(b) > maxRW {
914			b = b[:maxRW]
915		}
916		o := &fd.wop
917		o.InitBuf(b)
918		n, err := execIO(o, func(o *operation) error {
919			return windows.WSASendtoInet6(o.fd.Sysfd, &o.buf, 1, &o.qty, 0, sa6, &o.o, nil)
920		})
921		ntotal += int(n)
922		if err != nil {
923			return ntotal, err
924		}
925		buf = buf[n:]
926	}
927	return ntotal, nil
928}
929
930// Call ConnectEx. This doesn't need any locking, since it is only
931// called when the descriptor is first created. This is here rather
932// than in the net package so that it can use fd.wop.
933func (fd *FD) ConnectEx(ra syscall.Sockaddr) error {
934	o := &fd.wop
935	o.sa = ra
936	_, err := execIO(o, func(o *operation) error {
937		return ConnectExFunc(o.fd.Sysfd, o.sa, nil, 0, nil, &o.o)
938	})
939	return err
940}
941
942func (fd *FD) acceptOne(s syscall.Handle, rawsa []syscall.RawSockaddrAny, o *operation) (string, error) {
943	// Submit accept request.
944	o.handle = s
945	o.rsan = int32(unsafe.Sizeof(rawsa[0]))
946	_, err := execIO(o, func(o *operation) error {
947		return AcceptFunc(o.fd.Sysfd, o.handle, (*byte)(unsafe.Pointer(&rawsa[0])), 0, uint32(o.rsan), uint32(o.rsan), &o.qty, &o.o)
948	})
949	if err != nil {
950		CloseFunc(s)
951		return "acceptex", err
952	}
953
954	// Inherit properties of the listening socket.
955	err = syscall.Setsockopt(s, syscall.SOL_SOCKET, syscall.SO_UPDATE_ACCEPT_CONTEXT, (*byte)(unsafe.Pointer(&fd.Sysfd)), int32(unsafe.Sizeof(fd.Sysfd)))
956	if err != nil {
957		CloseFunc(s)
958		return "setsockopt", err
959	}
960
961	return "", nil
962}
963
964// Accept handles accepting a socket. The sysSocket parameter is used
965// to allocate the net socket.
966func (fd *FD) Accept(sysSocket func() (syscall.Handle, error)) (syscall.Handle, []syscall.RawSockaddrAny, uint32, string, error) {
967	if err := fd.readLock(); err != nil {
968		return syscall.InvalidHandle, nil, 0, "", err
969	}
970	defer fd.readUnlock()
971
972	o := &fd.rop
973	var rawsa [2]syscall.RawSockaddrAny
974	for {
975		s, err := sysSocket()
976		if err != nil {
977			return syscall.InvalidHandle, nil, 0, "", err
978		}
979
980		errcall, err := fd.acceptOne(s, rawsa[:], o)
981		if err == nil {
982			return s, rawsa[:], uint32(o.rsan), "", nil
983		}
984
985		// Sometimes we see WSAECONNRESET and ERROR_NETNAME_DELETED is
986		// returned here. These happen if connection reset is received
987		// before AcceptEx could complete. These errors relate to new
988		// connection, not to AcceptEx, so ignore broken connection and
989		// try AcceptEx again for more connections.
990		errno, ok := err.(syscall.Errno)
991		if !ok {
992			return syscall.InvalidHandle, nil, 0, errcall, err
993		}
994		switch errno {
995		case syscall.ERROR_NETNAME_DELETED, syscall.WSAECONNRESET:
996			// ignore these and try again
997		default:
998			return syscall.InvalidHandle, nil, 0, errcall, err
999		}
1000	}
1001}
1002
1003// Seek wraps syscall.Seek.
1004func (fd *FD) Seek(offset int64, whence int) (int64, error) {
1005	if fd.kind == kindPipe {
1006		return 0, syscall.ESPIPE
1007	}
1008	if err := fd.incref(); err != nil {
1009		return 0, err
1010	}
1011	defer fd.decref()
1012
1013	fd.l.Lock()
1014	defer fd.l.Unlock()
1015
1016	return syscall.Seek(fd.Sysfd, offset, whence)
1017}
1018
1019// Fchmod updates syscall.ByHandleFileInformation.Fileattributes when needed.
1020func (fd *FD) Fchmod(mode uint32) error {
1021	if err := fd.incref(); err != nil {
1022		return err
1023	}
1024	defer fd.decref()
1025
1026	var d syscall.ByHandleFileInformation
1027	if err := syscall.GetFileInformationByHandle(fd.Sysfd, &d); err != nil {
1028		return err
1029	}
1030	attrs := d.FileAttributes
1031	if mode&syscall.S_IWRITE != 0 {
1032		attrs &^= syscall.FILE_ATTRIBUTE_READONLY
1033	} else {
1034		attrs |= syscall.FILE_ATTRIBUTE_READONLY
1035	}
1036	if attrs == d.FileAttributes {
1037		return nil
1038	}
1039
1040	var du windows.FILE_BASIC_INFO
1041	du.FileAttributes = attrs
1042	return windows.SetFileInformationByHandle(fd.Sysfd, windows.FileBasicInfo, unsafe.Pointer(&du), uint32(unsafe.Sizeof(du)))
1043}
1044
1045// Fchdir wraps syscall.Fchdir.
1046func (fd *FD) Fchdir() error {
1047	if err := fd.incref(); err != nil {
1048		return err
1049	}
1050	defer fd.decref()
1051	return syscall.Fchdir(fd.Sysfd)
1052}
1053
1054// GetFileType wraps syscall.GetFileType.
1055func (fd *FD) GetFileType() (uint32, error) {
1056	if err := fd.incref(); err != nil {
1057		return 0, err
1058	}
1059	defer fd.decref()
1060	return syscall.GetFileType(fd.Sysfd)
1061}
1062
1063// GetFileInformationByHandle wraps GetFileInformationByHandle.
1064func (fd *FD) GetFileInformationByHandle(data *syscall.ByHandleFileInformation) error {
1065	if err := fd.incref(); err != nil {
1066		return err
1067	}
1068	defer fd.decref()
1069	return syscall.GetFileInformationByHandle(fd.Sysfd, data)
1070}
1071
1072// RawRead invokes the user-defined function f for a read operation.
1073func (fd *FD) RawRead(f func(uintptr) bool) error {
1074	if err := fd.readLock(); err != nil {
1075		return err
1076	}
1077	defer fd.readUnlock()
1078	for {
1079		if f(uintptr(fd.Sysfd)) {
1080			return nil
1081		}
1082
1083		// Use a zero-byte read as a way to get notified when this
1084		// socket is readable. h/t https://stackoverflow.com/a/42019668/332798
1085		o := &fd.rop
1086		o.InitBuf(nil)
1087		if !fd.IsStream {
1088			o.flags |= windows.MSG_PEEK
1089		}
1090		_, err := execIO(o, func(o *operation) error {
1091			return syscall.WSARecv(o.fd.Sysfd, &o.buf, 1, &o.qty, &o.flags, &o.o, nil)
1092		})
1093		if err == windows.WSAEMSGSIZE {
1094			// expected with a 0-byte peek, ignore.
1095		} else if err != nil {
1096			return err
1097		}
1098	}
1099}
1100
1101// RawWrite invokes the user-defined function f for a write operation.
1102func (fd *FD) RawWrite(f func(uintptr) bool) error {
1103	if err := fd.writeLock(); err != nil {
1104		return err
1105	}
1106	defer fd.writeUnlock()
1107
1108	if f(uintptr(fd.Sysfd)) {
1109		return nil
1110	}
1111
1112	// TODO(tmm1): find a way to detect socket writability
1113	return syscall.EWINDOWS
1114}
1115
// sockaddrInet4ToRaw zeroes *rsa and fills it in as an IPv4 raw socket
// address built from sa. The port is stored high byte first. It
// returns the size in bytes of the raw IPv4 address structure.
func sockaddrInet4ToRaw(rsa *syscall.RawSockaddrAny, sa *syscall.SockaddrInet4) int32 {
	*rsa = syscall.RawSockaddrAny{}
	in4 := (*syscall.RawSockaddrInet4)(unsafe.Pointer(rsa))
	in4.Family = syscall.AF_INET
	in4.Addr = sa.Addr

	port := (*[2]byte)(unsafe.Pointer(&in4.Port))
	port[0], port[1] = byte(sa.Port>>8), byte(sa.Port)
	return int32(unsafe.Sizeof(*in4))
}
1126
// sockaddrInet6ToRaw encodes sa into rsa as a RawSockaddrInet6 and returns
// the size in bytes of that raw structure.
//
// rsa is zeroed first. The port is written high byte first (network byte
// order); the zone ID is copied into Scope_id unchanged.
func sockaddrInet6ToRaw(rsa *syscall.RawSockaddrAny, sa *syscall.SockaddrInet6) int32 {
	*rsa = syscall.RawSockaddrAny{}

	in6 := (*syscall.RawSockaddrInet6)(unsafe.Pointer(rsa))
	in6.Family = syscall.AF_INET6
	in6.Addr = sa.Addr
	in6.Scope_id = sa.ZoneId

	// Write the port byte by byte so the result does not depend on the
	// host's endianness.
	port := (*[2]byte)(unsafe.Pointer(&in6.Port))
	port[0], port[1] = byte(sa.Port>>8), byte(sa.Port)

	return int32(unsafe.Sizeof(*in6))
}
1138
// rawToSockaddrInet4 decodes rsa, viewed as a RawSockaddrInet4, into sa.
//
// The port is read high byte first (network byte order). Only sa.Port and
// sa.Addr are written.
func rawToSockaddrInet4(rsa *syscall.RawSockaddrAny, sa *syscall.SockaddrInet4) {
	in4 := (*syscall.RawSockaddrInet4)(unsafe.Pointer(rsa))
	port := (*[2]byte)(unsafe.Pointer(&in4.Port))
	sa.Addr = in4.Addr
	sa.Port = int(port[0])<<8 | int(port[1])
}
1145
// rawToSockaddrInet6 decodes rsa, viewed as a RawSockaddrInet6, into sa.
//
// The port is read high byte first (network byte order). Scope_id is copied
// into sa.ZoneId unchanged. Only sa.Port, sa.ZoneId and sa.Addr are written.
func rawToSockaddrInet6(rsa *syscall.RawSockaddrAny, sa *syscall.SockaddrInet6) {
	in6 := (*syscall.RawSockaddrInet6)(unsafe.Pointer(rsa))
	port := (*[2]byte)(unsafe.Pointer(&in6.Port))
	sa.Addr = in6.Addr
	sa.ZoneId = in6.Scope_id
	sa.Port = int(port[0])<<8 | int(port[1])
}
1153
1154func sockaddrToRaw(rsa *syscall.RawSockaddrAny, sa syscall.Sockaddr) (int32, error) {
1155	switch sa := sa.(type) {
1156	case *syscall.SockaddrInet4:
1157		sz := sockaddrInet4ToRaw(rsa, sa)
1158		return sz, nil
1159	case *syscall.SockaddrInet6:
1160		sz := sockaddrInet6ToRaw(rsa, sa)
1161		return sz, nil
1162	default:
1163		return 0, syscall.EWINDOWS
1164	}
1165}
1166
1167// ReadMsg wraps the WSARecvMsg network call.
1168func (fd *FD) ReadMsg(p []byte, oob []byte, flags int) (int, int, int, syscall.Sockaddr, error) {
1169	if err := fd.readLock(); err != nil {
1170		return 0, 0, 0, nil, err
1171	}
1172	defer fd.readUnlock()
1173
1174	if len(p) > maxRW {
1175		p = p[:maxRW]
1176	}
1177
1178	o := &fd.rop
1179	o.InitMsg(p, oob)
1180	if o.rsa == nil {
1181		o.rsa = new(syscall.RawSockaddrAny)
1182	}
1183	o.msg.Name = (syscall.Pointer)(unsafe.Pointer(o.rsa))
1184	o.msg.Namelen = int32(unsafe.Sizeof(*o.rsa))
1185	o.msg.Flags = uint32(flags)
1186	n, err := execIO(o, func(o *operation) error {
1187		return windows.WSARecvMsg(o.fd.Sysfd, &o.msg, &o.qty, &o.o, nil)
1188	})
1189	err = fd.eofError(n, err)
1190	var sa syscall.Sockaddr
1191	if err == nil {
1192		sa, err = o.rsa.Sockaddr()
1193	}
1194	return n, int(o.msg.Control.Len), int(o.msg.Flags), sa, err
1195}
1196
1197// ReadMsgInet4 is ReadMsg, but specialized to return a syscall.SockaddrInet4.
1198func (fd *FD) ReadMsgInet4(p []byte, oob []byte, flags int, sa4 *syscall.SockaddrInet4) (int, int, int, error) {
1199	if err := fd.readLock(); err != nil {
1200		return 0, 0, 0, err
1201	}
1202	defer fd.readUnlock()
1203
1204	if len(p) > maxRW {
1205		p = p[:maxRW]
1206	}
1207
1208	o := &fd.rop
1209	o.InitMsg(p, oob)
1210	if o.rsa == nil {
1211		o.rsa = new(syscall.RawSockaddrAny)
1212	}
1213	o.msg.Name = (syscall.Pointer)(unsafe.Pointer(o.rsa))
1214	o.msg.Namelen = int32(unsafe.Sizeof(*o.rsa))
1215	o.msg.Flags = uint32(flags)
1216	n, err := execIO(o, func(o *operation) error {
1217		return windows.WSARecvMsg(o.fd.Sysfd, &o.msg, &o.qty, &o.o, nil)
1218	})
1219	err = fd.eofError(n, err)
1220	if err == nil {
1221		rawToSockaddrInet4(o.rsa, sa4)
1222	}
1223	return n, int(o.msg.Control.Len), int(o.msg.Flags), err
1224}
1225
1226// ReadMsgInet6 is ReadMsg, but specialized to return a syscall.SockaddrInet6.
1227func (fd *FD) ReadMsgInet6(p []byte, oob []byte, flags int, sa6 *syscall.SockaddrInet6) (int, int, int, error) {
1228	if err := fd.readLock(); err != nil {
1229		return 0, 0, 0, err
1230	}
1231	defer fd.readUnlock()
1232
1233	if len(p) > maxRW {
1234		p = p[:maxRW]
1235	}
1236
1237	o := &fd.rop
1238	o.InitMsg(p, oob)
1239	if o.rsa == nil {
1240		o.rsa = new(syscall.RawSockaddrAny)
1241	}
1242	o.msg.Name = (syscall.Pointer)(unsafe.Pointer(o.rsa))
1243	o.msg.Namelen = int32(unsafe.Sizeof(*o.rsa))
1244	o.msg.Flags = uint32(flags)
1245	n, err := execIO(o, func(o *operation) error {
1246		return windows.WSARecvMsg(o.fd.Sysfd, &o.msg, &o.qty, &o.o, nil)
1247	})
1248	err = fd.eofError(n, err)
1249	if err == nil {
1250		rawToSockaddrInet6(o.rsa, sa6)
1251	}
1252	return n, int(o.msg.Control.Len), int(o.msg.Flags), err
1253}
1254
1255// WriteMsg wraps the WSASendMsg network call.
1256func (fd *FD) WriteMsg(p []byte, oob []byte, sa syscall.Sockaddr) (int, int, error) {
1257	if len(p) > maxRW {
1258		return 0, 0, errors.New("packet is too large (only 1GB is allowed)")
1259	}
1260
1261	if err := fd.writeLock(); err != nil {
1262		return 0, 0, err
1263	}
1264	defer fd.writeUnlock()
1265
1266	o := &fd.wop
1267	o.InitMsg(p, oob)
1268	if sa != nil {
1269		if o.rsa == nil {
1270			o.rsa = new(syscall.RawSockaddrAny)
1271		}
1272		len, err := sockaddrToRaw(o.rsa, sa)
1273		if err != nil {
1274			return 0, 0, err
1275		}
1276		o.msg.Name = (syscall.Pointer)(unsafe.Pointer(o.rsa))
1277		o.msg.Namelen = len
1278	}
1279	n, err := execIO(o, func(o *operation) error {
1280		return windows.WSASendMsg(o.fd.Sysfd, &o.msg, 0, &o.qty, &o.o, nil)
1281	})
1282	return n, int(o.msg.Control.Len), err
1283}
1284
1285// WriteMsgInet4 is WriteMsg specialized for syscall.SockaddrInet4.
1286func (fd *FD) WriteMsgInet4(p []byte, oob []byte, sa *syscall.SockaddrInet4) (int, int, error) {
1287	if len(p) > maxRW {
1288		return 0, 0, errors.New("packet is too large (only 1GB is allowed)")
1289	}
1290
1291	if err := fd.writeLock(); err != nil {
1292		return 0, 0, err
1293	}
1294	defer fd.writeUnlock()
1295
1296	o := &fd.wop
1297	o.InitMsg(p, oob)
1298	if o.rsa == nil {
1299		o.rsa = new(syscall.RawSockaddrAny)
1300	}
1301	len := sockaddrInet4ToRaw(o.rsa, sa)
1302	o.msg.Name = (syscall.Pointer)(unsafe.Pointer(o.rsa))
1303	o.msg.Namelen = len
1304	n, err := execIO(o, func(o *operation) error {
1305		return windows.WSASendMsg(o.fd.Sysfd, &o.msg, 0, &o.qty, &o.o, nil)
1306	})
1307	return n, int(o.msg.Control.Len), err
1308}
1309
1310// WriteMsgInet6 is WriteMsg specialized for syscall.SockaddrInet6.
1311func (fd *FD) WriteMsgInet6(p []byte, oob []byte, sa *syscall.SockaddrInet6) (int, int, error) {
1312	if len(p) > maxRW {
1313		return 0, 0, errors.New("packet is too large (only 1GB is allowed)")
1314	}
1315
1316	if err := fd.writeLock(); err != nil {
1317		return 0, 0, err
1318	}
1319	defer fd.writeUnlock()
1320
1321	o := &fd.wop
1322	o.InitMsg(p, oob)
1323	if o.rsa == nil {
1324		o.rsa = new(syscall.RawSockaddrAny)
1325	}
1326	len := sockaddrInet6ToRaw(o.rsa, sa)
1327	o.msg.Name = (syscall.Pointer)(unsafe.Pointer(o.rsa))
1328	o.msg.Namelen = len
1329	n, err := execIO(o, func(o *operation) error {
1330		return windows.WSASendMsg(o.fd.Sysfd, &o.msg, 0, &o.qty, &o.o, nil)
1331	})
1332	return n, int(o.msg.Control.Len), err
1333}
1334
1335func DupCloseOnExec(fd int) (int, string, error) {
1336	proc, err := syscall.GetCurrentProcess()
1337	if err != nil {
1338		return 0, "GetCurrentProcess", err
1339	}
1340
1341	var nfd syscall.Handle
1342	const inherit = false // analogous to CLOEXEC
1343	if err := syscall.DuplicateHandle(proc, syscall.Handle(fd), proc, &nfd, 0, inherit, syscall.DUPLICATE_SAME_ACCESS); err != nil {
1344		return 0, "DuplicateHandle", err
1345	}
1346	return int(nfd), "", nil
1347}
1348