2 * Copyright (c) 2015, 2016 VMware, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include "Conntrack.h"
18 #include "IpFragment.h"
20 #include "PacketParser.h"
22 #include "Conntrack-nat.h"
/* Disables compiler warning 4311 from this point of the file onwards. */
25 #pragma warning(disable:4311)
/*
 * Time conversion constants. None of them is referenced in the part of the
 * file visible here; presumably WINDOWS_TICK is the number of 100 ns ticks
 * per second and SEC_TO_UNIX_EPOCH the 1601-to-1970 offset in seconds --
 * TODO confirm against their users.
 */
27 #define WINDOWS_TICK 10000000
28 #define SEC_TO_UNIX_EPOCH 11644473600LL
29 #define SEC_TO_NANOSEC 1000000000LL
/* Thread routine that periodically expires entries; defined further down. */
31 KSTART_ROUTINE OvsConntrackEntryCleaner
;
/* Array of bucket list heads, indexed by (hash & CT_HASH_TABLE_MASK). */
32 static PLIST_ENTRY ovsConntrackTable
;
/* State shared with the cleaner thread: 'event', 'threadObject', 'exit'. */
33 static OVS_CT_THREAD_CTX ctThreadCtx
;
/* RW lock taken around lookups/updates of ovsConntrackTable. */
34 static PNDIS_RW_LOCK_EX ovsConntrackLockObj
;
/* RW lock taken around the OvsNat* calls made from this file. */
35 static PNDIS_RW_LOCK_EX ovsCtNatLockObj
;
36 extern POVS_SWITCH_CONTEXT gOvsSwitchContext
;
/* Number of entries currently linked into ovsConntrackTable; changed with
 * InterlockedIncrement/InterlockedDecrement on insert and delete. */
37 static LONG ctTotalEntries
;
39 static __inline
OvsCtFlush(UINT16 zone
, struct ovs_key_ct_tuple_ipv4
*tuple
);
40 static __inline NDIS_STATUS
41 MapNlToCtTuple(POVS_MESSAGE msgIn
, PNL_ATTR attr
,
42 struct ovs_key_ct_tuple_ipv4
*ct_tuple
);
44 *----------------------------------------------------------------------------
46 * Initialize the components used by Connection Tracking
47 *----------------------------------------------------------------------------
/*
 * OvsInitConntrack
 *     Sets up the connection-tracking state for the switch 'context':
 *       - allocates the conntrack RW lock and the NAT RW lock,
 *       - allocates the bucket array and initializes every bucket list head,
 *       - creates the OvsConntrackEntryCleaner system thread and keeps a
 *         referenced thread object in ctThreadCtx.threadObject,
 *       - calls OvsNatInit().
 *
 *     Every allocation failure returns STATUS_INSUFFICIENT_RESOURCES after
 *     releasing what had been allocated before it.
 *
 *     NOTE(review): the result of ObReferenceObjectByHandle() is not checked.
 *     NOTE(review): in the lines visible here the OvsNatInit() failure branch
 *     is followed by 'return STATUS_SUCCESS' -- confirm that the failure
 *     status really reaches the caller.
 */
50 OvsInitConntrack(POVS_SWITCH_CONTEXT context
)
53 HANDLE threadHandle
= NULL
;
56 /* Init the sync-lock */
57 ovsConntrackLockObj
= NdisAllocateRWLock(context
->NdisFilterHandle
);
58 if (ovsConntrackLockObj
== NULL
) {
59 return STATUS_INSUFFICIENT_RESOURCES
;
62 ovsCtNatLockObj
= NdisAllocateRWLock(context
->NdisFilterHandle
);
63 if (ovsCtNatLockObj
== NULL
) {
64 NdisFreeRWLock(ovsConntrackLockObj
);
65 ovsConntrackLockObj
= NULL
;
66 return STATUS_INSUFFICIENT_RESOURCES
;
69 /* Init the Hash Buffer */
70 ovsConntrackTable
= OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY
)
73 if (ovsConntrackTable
== NULL
) {
74 NdisFreeRWLock(ovsConntrackLockObj
);
75 ovsConntrackLockObj
= NULL
;
76 NdisFreeRWLock(ovsCtNatLockObj
);
77 ovsCtNatLockObj
= NULL
;
78 return STATUS_INSUFFICIENT_RESOURCES
;
81 for (int i
= 0; i
< CT_HASH_TABLE_SIZE
; i
++) {
82 InitializeListHead(&ovsConntrackTable
[i
]);
85 /* Init CT Cleaner Thread */
86 KeInitializeEvent(&ctThreadCtx
.event
, NotificationEvent
, FALSE
);
87 status
= PsCreateSystemThread(&threadHandle
, SYNCHRONIZE
, NULL
, NULL
,
88 NULL
, OvsConntrackEntryCleaner
,
/* Thread creation failed: undo both locks and the bucket array. */
91 if (status
!= STATUS_SUCCESS
) {
92 NdisFreeRWLock(ovsConntrackLockObj
);
93 ovsConntrackLockObj
= NULL
;
95 NdisFreeRWLock(ovsCtNatLockObj
);
96 ovsCtNatLockObj
= NULL
;
98 OvsFreeMemoryWithTag(ovsConntrackTable
, OVS_CT_POOL_TAG
);
99 ovsConntrackTable
= NULL
;
/* Keep a referenced thread object for the later wait in
 * OvsCleanupConntrack(); the handle itself is closed right away. */
104 ObReferenceObjectByHandle(threadHandle
, SYNCHRONIZE
, NULL
, KernelMode
,
105 &ctThreadCtx
.threadObject
, NULL
);
106 ZwClose(threadHandle
);
109 status
= OvsNatInit();
111 if (status
!= STATUS_SUCCESS
) {
112 OvsCleanupConntrack();
115 return STATUS_SUCCESS
;
119 *----------------------------------------------------------------------------
120 * OvsCleanupConntrack
121 * Cleanup memory and thread that were spawned for Connection tracking
122 *----------------------------------------------------------------------------
/*
 * OvsCleanupConntrack
 *     Tears down what OvsInitConntrack() created. Under the conntrack write
 *     lock it sets ctThreadCtx.exit and signals ctThreadCtx.event, then waits
 *     for the cleaner thread object and drops the reference on it. After
 *     that the bucket array (if any) and both RW locks are freed and their
 *     globals reset to NULL.
 *
 *     NOTE(review): the statement announced by "Force flush all entries
 *     before removing" is not among the lines visible here, and neither is
 *     whatever runs while the NAT lock is held -- confirm both.
 */
125 OvsCleanupConntrack(VOID
)
127 LOCK_STATE_EX lockState
, lockStateNat
;
128 NdisAcquireRWLockWrite(ovsConntrackLockObj
, &lockState
, 0);
129 ctThreadCtx
.exit
= 1;
130 KeSetEvent(&ctThreadCtx
.event
, 0, FALSE
);
131 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
/* Block until the cleaner thread has terminated. */
133 KeWaitForSingleObject(ctThreadCtx
.threadObject
, Executive
,
134 KernelMode
, FALSE
, NULL
);
135 ObDereferenceObject(ctThreadCtx
.threadObject
);
137 /* Force flush all entries before removing */
140 if (ovsConntrackTable
) {
141 OvsFreeMemoryWithTag(ovsConntrackTable
, OVS_CT_POOL_TAG
);
142 ovsConntrackTable
= NULL
;
145 NdisFreeRWLock(ovsConntrackLockObj
);
146 ovsConntrackLockObj
= NULL
;
147 NdisAcquireRWLockWrite(ovsCtNatLockObj
, &lockStateNat
, 0);
149 NdisReleaseRWLock(ovsCtNatLockObj
, &lockStateNat
);
150 NdisFreeRWLock(ovsCtNatLockObj
);
151 ovsCtNatLockObj
= NULL
;
/*
 * OvsCtKeyReverse
 *     Modifies 'key' in place. Only the signature and the temporary endpoint
 *     'tmp' are visible here; presumably key->src and key->dst are swapped
 *     through 'tmp' (callers use it to build the reverse-direction key) --
 *     TODO confirm against the full body.
 */
155 OvsCtKeyReverse(OVS_CT_KEY
*key
)
157 struct ct_endpoint tmp
;
/*
 * OvsCtUpdateFlowKey
 *     Writes conntrack metadata into the flow key. key->ct.state always gets
 *     OVS_CS_F_TRACKED OR-ed into 'state'. key->ct.labels is either copied
 *     from 'labels' or zero-filled; the condition selecting between the two
 *     is not visible here (presumably a NULL check on 'labels', since callers
 *     in this file pass NULL -- TODO confirm).
 */
164 OvsCtUpdateFlowKey(struct OvsFlowKey
*key
,
168 struct ovs_key_ct_labels
*labels
)
170 key
->ct
.state
= state
| OVS_CS_F_TRACKED
;
174 NdisMoveMemory(&key
->ct
.labels
, labels
,
175 sizeof(struct ovs_key_ct_labels
));
177 memset(&key
->ct
.labels
, 0,
178 sizeof(struct ovs_key_ct_labels
));
/*
 * OvsPostCtEventEntry
 *     Builds a zero-initialized OVS_CT_EVENT_ENTRY on the stack, copies the
 *     whole of '*entry' into it, tags it with 'type' and hands it to
 *     OvsPostCtEvent(). Because the entry is copied, the caller's entry is
 *     not referenced by the event afterwards.
 */
183 OvsPostCtEventEntry(POVS_CT_ENTRY entry
, UINT8 type
)
185 OVS_CT_EVENT_ENTRY ctEventEntry
= {0};
186 NdisMoveMemory(&ctEventEntry
.entry
, entry
, sizeof(OVS_CT_ENTRY
));
187 ctEventEntry
.type
= type
;
188 OvsPostCtEvent(&ctEventEntry
);
/*
 * OvsCtIncrementCounters
 *     Accounts one packet against 'entry': adds OvsPacketLenNBL(nbl) to the
 *     byte counter and bumps the packet counter of either entry->rev_key or
 *     entry->key. The branch choosing between them is not visible here;
 *     presumably 'reply' selects rev_key -- TODO confirm.
 */
192 OvsCtIncrementCounters(POVS_CT_ENTRY entry
, BOOLEAN reply
, PNET_BUFFER_LIST nbl
)
195 entry
->rev_key
.byteCount
+= OvsPacketLenNBL(nbl
);
196 entry
->rev_key
.packetCount
++;
198 entry
->key
.byteCount
+= OvsPacketLenNBL(nbl
);
199 entry
->key
.packetCount
++;
/*
 * OvsCtAddEntry
 *     Fills in 'entry' from the lookup context and links it into the table.
 *
 *     entry->key and entry->rev_key both start as copies of ctx->key, and
 *     rev_key is then reversed. With natInfo == NULL the entry's NAT action
 *     is NAT_ACTION_NONE. Otherwise, under the NAT write lock: for a forward
 *     NAT action the whole natInfo is copied, OvsNatTranslateCtEntry() is
 *     run (the lock is released on its failure path) and ctx->hash is
 *     recomputed from entry->key; the remaining visible assignment copies
 *     only natInfo->natAction.
 *
 *     Finally timestampStart is set to 'now', the entry is inserted at the
 *     head of bucket (ctx->hash & CT_HASH_TABLE_MASK) and ctTotalEntries is
 *     incremented atomically.
 *
 *     The return statements are not visible here; the caller treats a false
 *     result as "could not add" and frees the entry.
 */
203 static __inline BOOLEAN
204 OvsCtAddEntry(POVS_CT_ENTRY entry
, OvsConntrackKeyLookupCtx
*ctx
,
205 PNAT_ACTION_INFO natInfo
, UINT64 now
)
207 NdisMoveMemory(&entry
->key
, &ctx
->key
, sizeof(OVS_CT_KEY
));
208 NdisMoveMemory(&entry
->rev_key
, &ctx
->key
, sizeof(OVS_CT_KEY
));
209 OvsCtKeyReverse(&entry
->rev_key
);
211 /* NatInfo is always initialized to be disabled, so that if NAT action
212 * fails, we will not end up deleting an non-existent NAT entry.
214 if (natInfo
== NULL
) {
215 entry
->natInfo
.natAction
= NAT_ACTION_NONE
;
217 LOCK_STATE_EX lockStateNat
;
218 NdisAcquireRWLockWrite(ovsCtNatLockObj
, &lockStateNat
, 0);
219 if (OvsIsForwardNat(natInfo
->natAction
)) {
220 entry
->natInfo
= *natInfo
;
221 if (!OvsNatTranslateCtEntry(entry
)) {
222 NdisReleaseRWLock(ovsCtNatLockObj
, &lockStateNat
);
225 ctx
->hash
= OvsHashCtKey(&entry
->key
);
227 entry
->natInfo
.natAction
= natInfo
->natAction
;
229 NdisReleaseRWLock(ovsCtNatLockObj
, &lockStateNat
);
232 entry
->timestampStart
= now
;
233 InsertHeadList(&ovsConntrackTable
[ctx
->hash
& CT_HASH_TABLE_MASK
],
236 InterlockedIncrement((LONG
volatile *)&ctTotalEntries
);
/*
 * OvsCtEntryCreate
 *     Handles a packet for which no conntrack entry exists.
 *
 *     OVS_CS_F_NEW is OR-ed into the state. Depending on the protocol the
 *     packet is validated (OvsConntrackValidateTcpPacket /
 *     OvsConntrackValidateIcmpPacket mark it OVS_CS_F_INVALID on failure) and
 *     an entry is created with OvsConntrackCreateTcpEntry, ...IcmpEntry or
 *     ...OtherEntry. A parent found by OvsCtRelatedLookup() adds
 *     OVS_CS_F_RELATED unless the state is already invalid.
 *
 *     When the state is valid and 'commit' is set, the parent is recorded
 *     and the entry is linked in with OvsCtAddEntry(); success sets
 *     *entryCreated to TRUE, failure frees the entry and marks the state
 *     invalid. *entryCreated starts out FALSE.
 *
 *     The flow key is then updated with the resulting state and the zone
 *     from ctx->key, and the packet is counted against the entry. The guards
 *     around several of these steps (protocol switch, NULL checks, the final
 *     return) are not among the lines visible here.
 */
240 static __inline POVS_CT_ENTRY
241 OvsCtEntryCreate(OvsForwardingContext
*fwdCtx
,
244 OvsConntrackKeyLookupCtx
*ctx
,
246 PNAT_ACTION_INFO natInfo
,
249 BOOLEAN
*entryCreated
)
251 POVS_CT_ENTRY entry
= NULL
;
253 POVS_CT_ENTRY parentEntry
;
254 PNET_BUFFER_LIST curNbl
= fwdCtx
->curNbl
;
256 *entryCreated
= FALSE
;
257 state
|= OVS_CS_F_NEW
;
264 tcp
= OvsGetTcp(curNbl
, l4Offset
, &tcpStorage
);
265 if (!OvsConntrackValidateTcpPacket(tcp
)) {
266 state
= OVS_CS_F_INVALID
;
271 entry
= OvsConntrackCreateTcpEntry(tcp
, curNbl
, currentTime
);
279 icmp
= OvsGetIcmp(curNbl
, l4Offset
, &storage
);
280 if (!OvsConntrackValidateIcmpPacket(icmp
)) {
281 state
= OVS_CS_F_INVALID
;
286 entry
= OvsConntrackCreateIcmpEntry(currentTime
);
293 entry
= OvsConntrackCreateOtherEntry(currentTime
);
298 state
= OVS_CS_F_INVALID
;
302 parentEntry
= OvsCtRelatedLookup(ctx
->key
, currentTime
);
303 if (parentEntry
!= NULL
&& state
!= OVS_CS_F_INVALID
) {
304 state
|= OVS_CS_F_RELATED
;
307 if (state
!= OVS_CS_F_INVALID
&& commit
) {
309 entry
->parent
= parentEntry
;
310 if (OvsCtAddEntry(entry
, ctx
, natInfo
, currentTime
)) {
311 *entryCreated
= TRUE
;
313 /* Unable to add entry to the list */
314 OvsFreeMemoryWithTag(entry
, OVS_CT_POOL_TAG
);
315 state
= OVS_CS_F_INVALID
;
319 /* OvsAllocateMemoryWithTag returned NULL; treat as invalid */
320 state
= OVS_CS_F_INVALID
;
324 OvsCtUpdateFlowKey(key
, state
, ctx
->key
.zone
, 0, NULL
);
326 OvsCtIncrementCounters(entry
, ctx
->reply
, curNbl
);
/*
 * OvsCtUpdateEntry
 *     Feeds one packet into the protocol-specific state machine of an
 *     existing entry and returns its verdict. TCP packets go through
 *     OvsGetTcp() and OvsConntrackUpdateTcpEntry(); the other two visible
 *     handlers are OvsConntrackUpdateIcmpEntry() and
 *     OvsConntrackUpdateOtherEntry(). CT_UPDATE_INVALID is assigned in two
 *     places whose conditions are not visible here (presumably a missing TCP
 *     header and an unsupported protocol -- TODO confirm).
 */
331 static enum CT_UPDATE_RES
332 OvsCtUpdateEntry(OVS_CT_ENTRY
* entry
,
333 PNET_BUFFER_LIST nbl
,
339 CT_UPDATE_RES status
;
345 tcp
= OvsGetTcp(nbl
, l4Offset
, &tcpStorage
);
347 status
= CT_UPDATE_INVALID
;
350 status
= OvsConntrackUpdateTcpEntry(entry
, tcp
, nbl
, reply
, now
);
354 status
= OvsConntrackUpdateIcmpEntry(entry
, reply
, now
);
357 status
= OvsConntrackUpdateOtherEntry(entry
, reply
, now
);
360 status
= CT_UPDATE_INVALID
;
366 static __inline BOOLEAN
367 OvsCtEntryExpired(POVS_CT_ENTRY entry
)
370 NdisGetCurrentSystemTime((LARGE_INTEGER
*)¤tTime
);
371 return entry
->expiration
< currentTime
;
/*
 * OvsCtEntryDelete
 *     Removes 'entry' when 'forceDelete' is set or the entry has expired.
 *     If the entry carries a NAT action, its NAT key is deleted under the NAT
 *     write lock first. An OVS_EVENT_CT_DELETE event is posted while the
 *     entry is still valid, then the entry is unlinked from its bucket,
 *     freed, and ctTotalEntries is decremented atomically.
 *
 *     Callers in this file invoke it while holding the conntrack write lock.
 */
375 OvsCtEntryDelete(POVS_CT_ENTRY entry
, BOOLEAN forceDelete
)
380 if (forceDelete
|| OvsCtEntryExpired(entry
)) {
381 if (entry
->natInfo
.natAction
) {
382 LOCK_STATE_EX lockStateNat
;
383 NdisAcquireRWLockWrite(ovsCtNatLockObj
, &lockStateNat
, 0);
384 OvsNatDeleteKey(&entry
->key
);
385 NdisReleaseRWLock(ovsCtNatLockObj
, &lockStateNat
);
387 OvsPostCtEventEntry(entry
, OVS_EVENT_CT_DELETE
);
388 RemoveEntryList(&entry
->link
);
389 OvsFreeMemoryWithTag(entry
, OVS_CT_POOL_TAG
);
390 InterlockedDecrement((LONG
volatile*)&ctTotalEntries
);
/*
 * OvsDetectCtPacket
 *     Decides whether the packet described by 'key' can be tracked.
 *     Fragmented packets (nwFrag != OVS_FRAG_TYPE_NONE) are handed to
 *     OvsProcessIpv4Fragment() and its status is returned. Unfragmented
 *     TCP, UDP and ICMP packets yield NDIS_STATUS_SUCCESS; everything else
 *     yields NDIS_STATUS_NOT_SUPPORTED.
 *
 *     NOTE(review): the in-body remark about supporting "only Unfragmented
 *     TCP packets" no longer matches the code below, which also accepts UDP
 *     and ICMP and forwards fragments for reassembly.
 */
395 static __inline NDIS_STATUS
396 OvsDetectCtPacket(OvsForwardingContext
*fwdCtx
,
398 PNET_BUFFER_LIST
*newNbl
)
400 /* Currently we support only Unfragmented TCP packets */
401 switch (ntohs(key
->l2
.dlType
)) {
403 if (key
->ipKey
.nwFrag
!= OVS_FRAG_TYPE_NONE
) {
404 return OvsProcessIpv4Fragment(fwdCtx
->switchContext
,
406 fwdCtx
->completionList
,
407 fwdCtx
->fwdDetail
->SourcePortId
,
408 key
->tunKey
.tunnelId
,
411 if (key
->ipKey
.nwProto
== IPPROTO_TCP
412 || key
->ipKey
.nwProto
== IPPROTO_UDP
413 || key
->ipKey
.nwProto
== IPPROTO_ICMP
) {
414 return NDIS_STATUS_SUCCESS
;
416 return NDIS_STATUS_NOT_SUPPORTED
;
418 return NDIS_STATUS_NOT_SUPPORTED
;
421 return NDIS_STATUS_NOT_SUPPORTED
;
/*
 * OvsCtKeyAreSame
 *     Returns non-zero when two conntrack keys identify the same connection:
 *     the src endpoints and the dst endpoints compare equal byte for byte and
 *     dl_type, nw_proto and zone match. Both keys are taken by value.
 */
425 OvsCtKeyAreSame(OVS_CT_KEY ctxKey
, OVS_CT_KEY entryKey
)
427 return ((NdisEqualMemory(&ctxKey
.src
, &entryKey
.src
,
428 sizeof(struct ct_endpoint
))) &&
429 (NdisEqualMemory(&ctxKey
.dst
, &entryKey
.dst
,
430 sizeof(struct ct_endpoint
))) &&
431 (ctxKey
.dl_type
== entryKey
.dl_type
) &&
432 (ctxKey
.nw_proto
== entryKey
.nw_proto
) &&
433 (ctxKey
.zone
== entryKey
.zone
));
/*
 * OvsCtLookup
 *     Searches the bucket selected by (ctx->hash & CT_HASH_TABLE_MASK) for an
 *     entry whose key equals ctx->key or, failing that, the src/dst-flipped
 *     copy of ctx->key (the reply direction). The table is not walked at all
 *     when ctTotalEntries is zero. A match is additionally checked with
 *     OvsCtEntryExpired(); what happens to an expired match and how
 *     ctx->entry / ctx->reply are filled in is not visible here.
 */
437 OvsCtLookup(OvsConntrackKeyLookupCtx
*ctx
)
441 BOOLEAN reply
= FALSE
;
442 POVS_CT_ENTRY found
= NULL
;
444 /* Reverse NAT must be performed before OvsCtLookup, so here
445 * we simply need to flip the src and dst in key and compare
446 * they are equal. Note that flipped key is not equal to
447 * rev_key due to NAT effect.
449 OVS_CT_KEY revCtxKey
= ctx
->key
;
450 OvsCtKeyReverse(&revCtxKey
);
452 if (!ctTotalEntries
) {
456 LIST_FORALL(&ovsConntrackTable
[ctx
->hash
& CT_HASH_TABLE_MASK
], link
) {
457 entry
= CONTAINING_RECORD(link
, OVS_CT_ENTRY
, link
);
459 if (OvsCtKeyAreSame(ctx
->key
, entry
->key
)) {
464 if (!found
&& OvsCtKeyAreSame(revCtxKey
, entry
->key
)) {
470 if (OvsCtEntryExpired(found
)) {
/*
 * OvsHashCtKey
 *     Computes the bucket hash of a conntrack key. The src and dst endpoints
 *     are hashed separately and XOR-ed, so a key and its src/dst-swapped
 *     counterpart produce the same intermediate value. That value is then
 *     replaced by a hash over the part of the key that follows 'dst'; the
 *     length argument is the distance, in 32-bit words, from the end of 'dst'
 *     to the end of the key. The final argument of that call and the return
 *     statement are not visible here (presumably the XOR value is used as the
 *     basis and 'hash' is returned -- TODO confirm).
 */
484 OvsHashCtKey(const OVS_CT_KEY
*key
)
486 UINT32 hsrc
, hdst
, hash
;
487 hsrc
= OvsJhashBytes((UINT32
*) &key
->src
, sizeof(key
->src
), 0);
488 hdst
= OvsJhashBytes((UINT32
*) &key
->dst
, sizeof(key
->dst
), 0);
489 hash
= hsrc
^ hdst
; /* TO identify reverse traffic */
490 hash
= OvsJhashBytes((uint32_t *) &key
->dst
+ 1,
491 ((uint32_t *) (key
+ 1) -
492 (uint32_t *) (&key
->dst
+ 1)),
/*
 * OvsReverseIcmpType
 *     Maps an ICMPv4 query type to the type of the matching message in the
 *     opposite direction: echo, timestamp and info requests map to their
 *     replies and vice versa. The result for any other type is not visible
 *     here.
 */
498 OvsReverseIcmpType(UINT8 type
)
501 case ICMP4_ECHO_REQUEST
:
502 return ICMP4_ECHO_REPLY
;
503 case ICMP4_ECHO_REPLY
:
504 return ICMP4_ECHO_REQUEST
;
505 case ICMP4_TIMESTAMP_REQUEST
:
506 return ICMP4_TIMESTAMP_REPLY
;
507 case ICMP4_TIMESTAMP_REPLY
:
508 return ICMP4_TIMESTAMP_REQUEST
;
509 case ICMP4_INFO_REQUEST
:
510 return ICMP4_INFO_REPLY
;
511 case ICMP4_INFO_REPLY
:
512 return ICMP4_INFO_REQUEST
;
/*
 * OvsCtSetupLookupCtx
 *     Builds the conntrack lookup key in 'ctx' from the flow key.
 *
 *     zone and dl_type are copied first and ctx->related is cleared. For IPv4
 *     the addresses, protocol and L4 ports come from flowKey->ipKey; for ICMP
 *     query messages (echo/timestamp/info) a non-zero code is rejected with
 *     NDIS_STATUS_INVALID_PACKET and the key stores the ICMP id on both
 *     endpoints, the packet's type on src and the reversed type on dst. ICMP
 *     error types are recognized but their inner packet is not parsed (see
 *     the XXX notes). For IPv6 the same fields come from flowKey->ipv6Key.
 *     Any other ethertype returns NDIS_STATUS_INVALID_PACKET.
 *
 *     The key is then looked up in the NAT table; on a hit it is replaced by
 *     the reversed key of the owning conntrack entry so that reply traffic
 *     finds it. ctx->hash is computed last.
 *
 *     NOTE(review): whether 'icmp' is checked for NULL before 'icmp->type'
 *     is read is not visible here.
 */
518 static __inline NDIS_STATUS
519 OvsCtSetupLookupCtx(OvsFlowKey
*flowKey
,
521 OvsConntrackKeyLookupCtx
*ctx
,
522 PNET_BUFFER_LIST curNbl
,
525 const OVS_NAT_ENTRY
*natEntry
;
526 ctx
->key
.zone
= zone
;
527 ctx
->key
.dl_type
= flowKey
->l2
.dlType
;
528 ctx
->related
= FALSE
;
530 /* Extract L3 and L4*/
531 if (flowKey
->l2
.dlType
== htons(ETH_TYPE_IPV4
)) {
532 ctx
->key
.src
.addr
.ipv4
= flowKey
->ipKey
.nwSrc
;
533 ctx
->key
.dst
.addr
.ipv4
= flowKey
->ipKey
.nwDst
;
534 ctx
->key
.nw_proto
= flowKey
->ipKey
.nwProto
;
536 ctx
->key
.src
.port
= flowKey
->ipKey
.l4
.tpSrc
;
537 ctx
->key
.dst
.port
= flowKey
->ipKey
.l4
.tpDst
;
538 if (flowKey
->ipKey
.nwProto
== IPPROTO_ICMP
) {
541 icmp
= OvsGetIcmp(curNbl
, l4Offset
, &icmpStorage
);
544 /* Related bit is set when ICMP has an error */
545 /* XXX parse out the appropriate src and dst from inner pkt */
546 switch (icmp
->type
) {
547 case ICMP4_ECHO_REQUEST
:
548 case ICMP4_ECHO_REPLY
:
549 case ICMP4_TIMESTAMP_REQUEST
:
550 case ICMP4_TIMESTAMP_REPLY
:
551 case ICMP4_INFO_REQUEST
:
552 case ICMP4_INFO_REPLY
:
553 if (icmp
->code
!= 0) {
554 return NDIS_STATUS_INVALID_PACKET
;
556 /* Separate ICMP connection: identified using id */
557 ctx
->key
.dst
.icmp_id
= icmp
->fields
.echo
.id
;
558 ctx
->key
.src
.icmp_id
= icmp
->fields
.echo
.id
;
559 ctx
->key
.src
.icmp_type
= icmp
->type
;
560 ctx
->key
.dst
.icmp_type
= OvsReverseIcmpType(icmp
->type
);
562 case ICMP4_DEST_UNREACH
:
563 case ICMP4_TIME_EXCEEDED
:
564 case ICMP4_PARAM_PROB
:
565 case ICMP4_SOURCE_QUENCH
:
566 case ICMP4_REDIRECT
: {
567 /* XXX Handle inner packet */
572 ctx
->related
= FALSE
;
575 } else if (flowKey
->l2
.dlType
== htons(ETH_TYPE_IPV6
)) {
576 ctx
->key
.src
.addr
.ipv6
= flowKey
->ipv6Key
.ipv6Src
;
577 ctx
->key
.dst
.addr
.ipv6
= flowKey
->ipv6Key
.ipv6Dst
;
578 ctx
->key
.nw_proto
= flowKey
->ipv6Key
.nwProto
;
580 ctx
->key
.src
.port
= flowKey
->ipv6Key
.l4
.tpSrc
;
581 ctx
->key
.dst
.port
= flowKey
->ipv6Key
.l4
.tpDst
;
582 /* XXX Handle ICMPv6 errors*/
584 return NDIS_STATUS_INVALID_PACKET
;
/* NOTE(review): natEntry->ctEntry is read below after the NAT lock has been
 * released -- confirm the NAT entry cannot be freed in between. */
587 LOCK_STATE_EX lockStateNat
;
588 NdisAcquireRWLockRead(ovsCtNatLockObj
, &lockStateNat
, 0);
589 natEntry
= OvsNatLookup(&ctx
->key
, TRUE
);
590 NdisReleaseRWLock(ovsCtNatLockObj
, &lockStateNat
);
592 /* Translate address first for reverse NAT */
593 ctx
->key
= natEntry
->ctEntry
->key
;
594 OvsCtKeyReverse(&ctx
->key
);
597 ctx
->hash
= OvsHashCtKey(&ctx
->key
);
598 return NDIS_STATUS_SUCCESS
;
601 static __inline BOOLEAN
602 OvsDetectFtpPacket(OvsFlowKey
*key
) {
603 return (key
->ipKey
.nwProto
== IPPROTO_TCP
&&
604 (ntohs(key
->ipKey
.l4
.tpDst
) == IPPORT_FTP
||
605 ntohs(key
->ipKey
.l4
.tpSrc
) == IPPORT_FTP
));
609 *----------------------------------------------------------------------------
610 * OvsProcessConntrackEntry
611 * Check the TCP flags and set the ct_state of the entry
612 *----------------------------------------------------------------------------
/*
 * OvsProcessConntrackEntry
 *     Handles a packet that matched an existing entry (ctx->entry) and
 *     derives the ct_state bits for it. *entryCreated starts out FALSE.
 *
 *     The entry is updated through OvsCtUpdateEntry(); CT_UPDATE_VALID adds
 *     OVS_CS_F_ESTABLISHED (plus OVS_CS_F_REPLY_DIR on one path),
 *     CT_UPDATE_INVALID adds OVS_CS_F_INVALID, and the remaining visible case
 *     force-deletes the old entry and creates a fresh one with
 *     OvsCtEntryCreate(). For TCP with a surviving entry the parent is looked
 *     up again through OvsCtRelatedLookup() and OVS_CS_F_RELATED is added
 *     when one exists. Finally the flow key receives the state and zone,
 *     together with either the entry's mark and labels or zero/NULL.
 *
 *     The conditions guarding several of these steps and the return statement
 *     are not among the lines visible here.
 */
614 static __inline POVS_CT_ENTRY
615 OvsProcessConntrackEntry(OvsForwardingContext
*fwdCtx
,
617 OvsConntrackKeyLookupCtx
*ctx
,
620 NAT_ACTION_INFO
*natInfo
,
623 BOOLEAN
*entryCreated
)
625 POVS_CT_ENTRY entry
= ctx
->entry
;
627 PNET_BUFFER_LIST curNbl
= fwdCtx
->curNbl
;
628 *entryCreated
= FALSE
;
630 /* If an entry was found, update the state based on TCP flags */
632 state
|= OVS_CS_F_RELATED
;
634 state
|= OVS_CS_F_REPLY_DIR
;
637 CT_UPDATE_RES result
;
638 result
= OvsCtUpdateEntry(entry
, curNbl
, key
->ipKey
.nwProto
,
639 l4Offset
, ctx
->reply
, currentTime
);
641 case CT_UPDATE_VALID
:
642 state
|= OVS_CS_F_ESTABLISHED
;
644 state
|= OVS_CS_F_REPLY_DIR
;
647 case CT_UPDATE_INVALID
:
648 state
|= OVS_CS_F_INVALID
;
651 //Delete and update the Conntrack
652 OvsCtEntryDelete(ctx
->entry
, TRUE
);
654 entry
= OvsCtEntryCreate(fwdCtx
, key
->ipKey
.nwProto
, l4Offset
,
655 ctx
, key
, natInfo
, commit
, currentTime
,
664 if (key
->ipKey
.nwProto
== IPPROTO_TCP
&& entry
) {
665 /* Update the related bit if there is a parent */
667 state
|= OVS_CS_F_RELATED
;
669 POVS_CT_ENTRY parentEntry
;
670 parentEntry
= OvsCtRelatedLookup(ctx
->key
, currentTime
);
671 entry
->parent
= parentEntry
;
672 if (parentEntry
!= NULL
) {
673 state
|= OVS_CS_F_RELATED
;
678 /* Copy mark and label from entry into flowKey. If actions specify
679 different mark and label, update the flowKey. */
681 OvsCtUpdateFlowKey(key
, state
, zone
, entry
->mark
, &entry
->labels
);
683 OvsCtUpdateFlowKey(key
, state
, zone
, 0, NULL
);
/*
 * OvsConntrackSetMark
 *     Applies a masked mark update: the new mark is 'value' OR-ed with the
 *     bits of entry->mark that lie outside 'mask'. When that differs from the
 *     current mark, both entry->mark and key->ct.mark are updated. How
 *     *markChanged is set is not visible here (presumably TRUE on change --
 *     TODO confirm).
 */
689 OvsConntrackSetMark(OvsFlowKey
*key
,
693 BOOLEAN
*markChanged
)
696 newMark
= value
| (entry
->mark
& ~(mask
));
697 if (entry
->mark
!= newMark
) {
698 entry
->mark
= newMark
;
699 key
->ct
.mark
= newMark
;
/*
 * OvsConntrackSetLabels
 *     Applies a masked 128-bit label update. 'val' and 'mask' are copied into
 *     local ovs_u128 values and merged, 64 bits at a time, as
 *     v | (pktMdLabel & ~m). If the result differs from entry->labels,
 *     *labelChanged is set to TRUE. The result is then stored in both
 *     entry->labels and key->ct.labels.
 *
 *     NOTE(review): in the lines visible here pktMdLabel is still all-zero
 *     when it is merged, which would discard the entry's current labels --
 *     confirm that a copy from entry->labels precedes the merge.
 */
705 OvsConntrackSetLabels(OvsFlowKey
*key
,
707 struct ovs_key_ct_labels
*val
,
708 struct ovs_key_ct_labels
*mask
,
709 BOOLEAN
*labelChanged
)
711 ovs_u128 v
, m
, pktMdLabel
= {0};
712 memcpy(&v
, val
, sizeof v
);
713 memcpy(&m
, mask
, sizeof m
);
715 pktMdLabel
.u64
.lo
= v
.u64
.lo
| (pktMdLabel
.u64
.lo
& ~(m
.u64
.lo
));
716 pktMdLabel
.u64
.hi
= v
.u64
.hi
| (pktMdLabel
.u64
.hi
& ~(m
.u64
.hi
));
718 if (!NdisEqualMemory(&entry
->labels
, &pktMdLabel
,
719 sizeof(struct ovs_key_ct_labels
))) {
720 *labelChanged
= TRUE
;
722 NdisMoveMemory(&entry
->labels
, &pktMdLabel
,
723 sizeof(struct ovs_key_ct_labels
));
724 NdisMoveMemory(&key
->ct
.labels
, &pktMdLabel
,
725 sizeof(struct ovs_key_ct_labels
));
/*
 * OvsCtSetMarkLabel
 *     Forwards the action's mark (value/mask) to OvsConntrackSetMark() and
 *     its labels (value/mask) to OvsConntrackSetLabels(). The guards around
 *     the two calls and the way their "changed" outputs feed
 *     *triggerUpdateEvent are not visible here.
 */
729 OvsCtSetMarkLabel(OvsFlowKey
*key
,
733 BOOLEAN
*triggerUpdateEvent
)
736 OvsConntrackSetMark(key
, entry
, mark
->value
, mark
->mask
,
741 OvsConntrackSetLabels(key
, entry
, &labels
->value
, &labels
->mask
,
/*
 * OvsCtUpdateTuple
 *     Copies the original-direction IPv4 tuple of 'ctKey' into
 *     key->ct.tuple_ipv4: source and destination address and the protocol.
 *     For ICMP the two port fields carry htons(icmp_type) and
 *     htons(icmp_code) taken from ctKey->src; the value used for other
 *     protocols is not visible here (presumably the L4 ports -- TODO
 *     confirm).
 */
747 OvsCtUpdateTuple(OvsFlowKey
*key
, OVS_CT_KEY
*ctKey
)
749 key
->ct
.tuple_ipv4
.ipv4_src
= ctKey
->src
.addr
.ipv4_aligned
;
750 key
->ct
.tuple_ipv4
.ipv4_dst
= ctKey
->dst
.addr
.ipv4_aligned
;
751 key
->ct
.tuple_ipv4
.ipv4_proto
= ctKey
->nw_proto
;
753 /* Orig tuple Port is overloaded to take in ICMP-Type & Code */
754 /* This mimics the behavior in lib/conntrack.c*/
755 key
->ct
.tuple_ipv4
.src_port
= ctKey
->nw_proto
!= IPPROTO_ICMP
?
757 htons(ctKey
->src
.icmp_type
);
758 key
->ct
.tuple_ipv4
.dst_port
= ctKey
->nw_proto
!= IPPROTO_ICMP
?
760 htons(ctKey
->src
.icmp_code
);
/*
 * OvsCtExecute_
 *     Core of the conntrack action for one packet.
 *
 *     After building the lookup context it takes the conntrack write lock and
 *     looks the connection up. With 'force' set, a match in the reply
 *     direction is deleted. If there is no entry, 'commit' is requested and
 *     ctTotalEntries has reached CT_MAX_ENTRIES, the lock is dropped and
 *     NDIS_STATUS_RESOURCES is returned. Otherwise the packet is counted,
 *     and the entry is either created (OvsCtEntryCreate) or processed
 *     (OvsProcessConntrackEntry).
 *
 *     When the action carries NAT, the packet is rewritten by OvsNatPacket()
 *     under the NAT write lock using the action stored in the entry. Mark and
 *     labels are then applied, FTP control traffic is passed to
 *     OvsCtHandleFtp(), the original tuple (the parent's when there is one)
 *     is copied into the flow key for IPv4 entries, and NEW/UPDATE events are
 *     posted before the lock is released.
 *
 *     NOTE(review): 'entry->key.dl_type == ntohs(ETH_TYPE_IPV4)' gives the
 *     same value as the htons() form used elsewhere in this file, but htons()
 *     would state the intent.
 */
763 static __inline NDIS_STATUS
764 OvsCtExecute_(OvsForwardingContext
*fwdCtx
,
766 OVS_PACKET_HDR_INFO
*layers
,
773 PNAT_ACTION_INFO natInfo
,
774 BOOLEAN postUpdateEvent
)
776 NDIS_STATUS status
= NDIS_STATUS_SUCCESS
;
777 BOOLEAN triggerUpdateEvent
= FALSE
;
778 POVS_CT_ENTRY entry
= NULL
;
779 PNET_BUFFER_LIST curNbl
= fwdCtx
->curNbl
;
780 OvsConntrackKeyLookupCtx ctx
= { 0 };
781 LOCK_STATE_EX lockState
;
/* NOTE(review): the argument below is a mis-decoded "&currentTime". */
783 NdisGetCurrentSystemTime((LARGE_INTEGER
*) ¤tTime
);
/* NOTE(review): OvsCtSetupLookupCtx() can return
 * NDIS_STATUS_INVALID_PACKET, but its result is discarded here. */
786 /* Retrieve the Conntrack Key related fields from packet */
787 OvsCtSetupLookupCtx(key
, zone
, &ctx
, curNbl
, layers
->l4Offset
);
789 NdisAcquireRWLockWrite(ovsConntrackLockObj
, &lockState
, 0);
791 /* Lookup Conntrack entries for a matching entry */
792 entry
= OvsCtLookup(&ctx
);
793 BOOLEAN entryCreated
= FALSE
;
795 /* Delete entry in reverse direction if 'force' is specified */
796 if (entry
&& force
&& ctx
.reply
) {
797 OvsCtEntryDelete(entry
, TRUE
);
801 if (!entry
&& commit
&& ctTotalEntries
>= CT_MAX_ENTRIES
) {
802 /* Don't proceed with processing if the max limit has been hit.
803 * This blocks only new entries from being created and doesn't
804 * affect existing connections.
806 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
807 OVS_LOG_ERROR("Conntrack Limit hit: %lu", ctTotalEntries
);
808 return NDIS_STATUS_RESOURCES
;
811 /* Increment the counters soon after the lookup, since we set ct.state
812 * to OVS_CS_F_TRACKED after processing the ct entry.
814 if (entry
&& (!(key
->ct
.state
& OVS_CS_F_TRACKED
))) {
815 OvsCtIncrementCounters(entry
, ctx
.reply
, curNbl
);
819 /* If no matching entry was found, create one and add New state */
820 entry
= OvsCtEntryCreate(fwdCtx
, key
->ipKey
.nwProto
,
821 layers
->l4Offset
, &ctx
,
822 key
, natInfo
, commit
, currentTime
,
825 /* Process the entry and update CT flags */
826 entry
= OvsProcessConntrackEntry(fwdCtx
, layers
->l4Offset
, &ctx
, key
,
827 zone
, natInfo
, commit
, currentTime
,
835 * Note that natInfo is not the same as entry->natInfo here. natInfo
836 * is decided by action in the openflow rule, entry->natInfo is decided
837 * when the entry is created. In the reverse NAT case, natInfo is
838 * NAT_ACTION_REVERSE, yet entry->natInfo is NAT_ACTION_SRC or
839 * NAT_ACTION_DST without NAT_ACTION_REVERSE
841 if (natInfo
->natAction
!= NAT_ACTION_NONE
)
843 LOCK_STATE_EX lockStateNat
;
844 NdisAcquireRWLockWrite(ovsCtNatLockObj
, &lockStateNat
, 0);
845 OvsNatPacket(fwdCtx
, entry
, entry
->natInfo
.natAction
,
847 NdisReleaseRWLock(ovsCtNatLockObj
, &lockStateNat
);
850 OvsCtSetMarkLabel(key
, entry
, mark
, labels
, &triggerUpdateEvent
);
852 if (OvsDetectFtpPacket(key
)) {
853 /* FTP parser will always be loaded */
854 UNREFERENCED_PARAMETER(helper
);
856 status
= OvsCtHandleFtp(curNbl
, key
, layers
, currentTime
, entry
,
857 (ntohs(key
->ipKey
.l4
.tpDst
) == IPPORT_FTP
));
858 if (status
!= NDIS_STATUS_SUCCESS
) {
859 OVS_LOG_ERROR("Error while parsing the FTP packet");
863 /* Add original tuple information to flow Key */
864 if (entry
->key
.dl_type
== ntohs(ETH_TYPE_IPV4
)) {
865 if (entry
->parent
!= NULL
) {
866 POVS_CT_ENTRY parent
= entry
->parent
;
867 OvsCtUpdateTuple(key
, &parent
->key
);
869 OvsCtUpdateTuple(key
, &entry
->key
);
874 OvsPostCtEventEntry(entry
, OVS_EVENT_CT_NEW
);
876 if (postUpdateEvent
&& !entryCreated
&& triggerUpdateEvent
) {
877 OvsPostCtEventEntry(entry
, OVS_EVENT_CT_UPDATE
);
880 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
886 *---------------------------------------------------------------------------
887 * OvsExecuteConntrackAction
888 * Executes Conntrack actions XXX - Add more
889 * For the IPv4 fragments, consume the original fragment NBL
890 *---------------------------------------------------------------------------
/*
 * OvsExecuteConntrackAction
 *     Entry point for the conntrack action. It first asks
 *     OvsDetectCtPacket() whether the packet can be tracked, then decodes
 *     the nested netlink attributes of action 'a' (zone, commit, mark,
 *     labels, NAT, helper, force-commit, event mask) and finally calls
 *     OvsCtExecute_() with the decoded values.
 *
 *     NAT decoding: SRC/DST attributes set the NAT direction; min/max
 *     address and port attributes fill natActionInfo. If no direction was
 *     given, the action becomes NAT_ACTION_REVERSE. A missing maximum
 *     defaults to the minimum, and the presence of any port bound adds the
 *     matching *_PORT flag.
 *
 *     Helper decoding: a NULL helper string returns
 *     NDIS_STATUS_INVALID_PARAMETER and anything other than "ftp" returns
 *     NDIS_STATUS_NOT_SUPPORTED.
 *
 *     postUpdateEvent is set when the event mask contains IPCT_MARK or
 *     IPCT_LABEL. Where the hasMin/hasMax flags are set to TRUE is not among
 *     the lines visible here.
 */
893 OvsExecuteConntrackAction(OvsForwardingContext
*fwdCtx
,
898 BOOLEAN commit
= FALSE
;
899 BOOLEAN force
= FALSE
;
900 BOOLEAN postUpdateEvent
= FALSE
;
902 UINT32 eventmask
= 0;
903 MD_MARK
*mark
= NULL
;
904 MD_LABELS
*labels
= NULL
;
906 NAT_ACTION_INFO natActionInfo
;
907 OVS_PACKET_HDR_INFO
*layers
= &fwdCtx
->layers
;
908 PNET_BUFFER_LIST newNbl
= NULL
;
911 memset(&natActionInfo
, 0, sizeof natActionInfo
);
912 status
= OvsDetectCtPacket(fwdCtx
, key
, &newNbl
);
913 if (status
!= NDIS_STATUS_SUCCESS
) {
917 /* XXX Convert this to NL_ATTR_FOR_EACH */
918 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_ZONE
);
920 zone
= NlAttrGetU16(ctAttr
);
922 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_COMMIT
);
926 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_MARK
);
928 mark
= NlAttrGet(ctAttr
);
930 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_LABELS
);
932 labels
= NlAttrGet(ctAttr
);
934 natActionInfo
.natAction
= NAT_ACTION_NONE
;
935 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_NAT
);
937 /* Pares Nested NAT attributes. */
940 BOOLEAN hasMinIp
= FALSE
;
941 BOOLEAN hasMinPort
= FALSE
;
942 BOOLEAN hasMaxIp
= FALSE
;
943 BOOLEAN hasMaxPort
= FALSE
;
944 NL_NESTED_FOR_EACH_UNSAFE (natAttr
, left
, ctAttr
) {
945 enum ovs_nat_attr subtype
= NlAttrType(natAttr
);
947 case OVS_NAT_ATTR_SRC
:
948 case OVS_NAT_ATTR_DST
:
949 natActionInfo
.natAction
|=
950 ((subtype
== OVS_NAT_ATTR_SRC
)
951 ? NAT_ACTION_SRC
: NAT_ACTION_DST
);
/* NOTE(review): the two address copies below take their length from the
 * attribute itself -- confirm it is bounded by the size of minAddr/maxAddr. */
953 case OVS_NAT_ATTR_IP_MIN
:
954 memcpy(&natActionInfo
.minAddr
,
955 NlAttrData(natAttr
), NlAttrGetSize(natAttr
));
958 case OVS_NAT_ATTR_IP_MAX
:
959 memcpy(&natActionInfo
.maxAddr
,
960 NlAttrData(natAttr
), NlAttrGetSize(natAttr
));
963 case OVS_NAT_ATTR_PROTO_MIN
:
964 natActionInfo
.minPort
= NlAttrGetU16(natAttr
);
967 case OVS_NAT_ATTR_PROTO_MAX
:
968 natActionInfo
.maxPort
= NlAttrGetU16(natAttr
);
971 case OVS_NAT_ATTR_PERSISTENT
:
972 case OVS_NAT_ATTR_PROTO_HASH
:
973 case OVS_NAT_ATTR_PROTO_RANDOM
:
977 if (natActionInfo
.natAction
== NAT_ACTION_NONE
) {
978 natActionInfo
.natAction
= NAT_ACTION_REVERSE
;
980 if (hasMinIp
&& !hasMaxIp
) {
981 memcpy(&natActionInfo
.maxAddr
,
982 &natActionInfo
.minAddr
,
983 sizeof(natActionInfo
.maxAddr
));
985 if (hasMinPort
&& !hasMaxPort
) {
986 natActionInfo
.maxPort
= natActionInfo
.minPort
;
988 if (hasMinPort
|| hasMaxPort
) {
989 if (natActionInfo
.natAction
& NAT_ACTION_SRC
) {
990 natActionInfo
.natAction
|= NAT_ACTION_SRC_PORT
;
991 } else if (natActionInfo
.natAction
& NAT_ACTION_DST
) {
992 natActionInfo
.natAction
|= NAT_ACTION_DST_PORT
;
996 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_HELPER
);
998 helper
= NlAttrGetString(ctAttr
);
999 if (helper
== NULL
) {
1000 return NDIS_STATUS_INVALID_PARAMETER
;
1002 if (strcmp("ftp", helper
) != 0) {
1003 /* Only support FTP */
1004 return NDIS_STATUS_NOT_SUPPORTED
;
1007 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_FORCE_COMMIT
);
1010 /* Force implicitly means commit */
1013 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_EVENTMASK
);
1015 eventmask
= NlAttrGetU32(ctAttr
);
1016 /* Only mark and label updates are supported. */
1017 if (eventmask
& (1 << IPCT_MARK
| 1 << IPCT_LABEL
))
1018 postUpdateEvent
= TRUE
;
1020 /* If newNbl is not allocated, use the current Nbl*/
1021 status
= OvsCtExecute_(fwdCtx
, key
, layers
,
1022 commit
, force
, zone
, mark
, labels
, helper
, &natActionInfo
,
1028 *----------------------------------------------------------------------------
1029 * OvsConntrackEntryCleaner
1030 * Runs periodically and cleans up the connection tracker
1031 *----------------------------------------------------------------------------
/*
 * OvsConntrackEntryCleaner
 *     Body of the cleaner system thread; 'data' is the OVS_CT_THREAD_CTX
 *     passed at thread creation. Each pass takes the conntrack write lock,
 *     leaves (after releasing the lock) when context->exit is set, and
 *     otherwise walks every bucket calling OvsCtEntryDelete(entry, FALSE),
 *     which removes only expired entries. It then releases the lock and
 *     waits on context->event with a relative timeout of CT_CLEANUP_INTERVAL,
 *     so signalling the event wakes it early. The thread ends through
 *     PsTerminateSystemThread(STATUS_SUCCESS).
 *
 *     The enclosing loop construct is not among the lines visible here.
 */
1034 OvsConntrackEntryCleaner(PVOID data
)
1037 POVS_CT_THREAD_CTX context
= (POVS_CT_THREAD_CTX
)data
;
1038 PLIST_ENTRY link
, next
;
1039 POVS_CT_ENTRY entry
;
1040 LOCK_STATE_EX lockState
;
1041 BOOLEAN success
= TRUE
;
1044 if (ovsConntrackLockObj
== NULL
) {
1045 /* Lock has been freed by 'OvsCleanupConntrack()' */
1048 NdisAcquireRWLockWrite(ovsConntrackLockObj
, &lockState
, 0);
1049 if (context
->exit
) {
1050 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
1054 /* Set the timeout for the thread and cleanup */
1055 INT64 threadSleepTimeout
= -CT_CLEANUP_INTERVAL
;
1057 if (ctTotalEntries
) {
1058 for (int i
= 0; i
< CT_HASH_TABLE_SIZE
; i
++) {
1059 LIST_FORALL_SAFE(&ovsConntrackTable
[i
], link
, next
) {
1060 entry
= CONTAINING_RECORD(link
, OVS_CT_ENTRY
, link
);
1061 OvsCtEntryDelete(entry
, FALSE
);
1065 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
1066 KeWaitForSingleObject(&context
->event
, Executive
, KernelMode
,
1067 FALSE
, (LARGE_INTEGER
*)&threadSleepTimeout
);
1070 PsTerminateSystemThread(STATUS_SUCCESS
);
1074 *----------------------------------------------------------------------------
1076 * Flushes out all Conntrack Entries that match any of the arguments
1077 *----------------------------------------------------------------------------
/*
 * OvsCtFlush
 *     Force-deletes conntrack entries under the conntrack write lock and
 *     returns NDIS_STATUS_SUCCESS.
 *
 *     With a tuple, an entry is deleted when its addresses and protocol match
 *     and either (non-ICMP tuple) its L4 ports match or (ICMP tuple) its ICMP
 *     type and code match the tuple's port fields; a non-zero 'zone'
 *     additionally has to match the entry's zone. The last branch deletes by
 *     zone alone, with zone 0 matching every entry.
 *
 *     The NAT write lock is taken afterwards while the conntrack lock is
 *     still held; the statement executed under it is not visible here.
 *
 *     NOTE(review): OvsCtDeleteCmdHandler() passes a NULL tuple when no
 *     CTA_TUPLE_ORIG attribute was supplied, but the check that keeps 'tuple'
 *     from being dereferenced in that case is not among the visible lines --
 *     confirm it exists.
 */
1079 static __inline NDIS_STATUS
1080 OvsCtFlush(UINT16 zone
, struct ovs_key_ct_tuple_ipv4
*tuple
)
1082 PLIST_ENTRY link
, next
;
1083 POVS_CT_ENTRY entry
;
1085 LOCK_STATE_EX lockState
, lockStateNat
;
1086 NdisAcquireRWLockWrite(ovsConntrackLockObj
, &lockState
, 0);
1088 if (ctTotalEntries
) {
1089 for (UINT32 i
= 0; i
< CT_HASH_TABLE_SIZE
; i
++) {
1090 LIST_FORALL_SAFE(&ovsConntrackTable
[i
], link
, next
) {
1091 entry
= CONTAINING_RECORD(link
, OVS_CT_ENTRY
, link
);
1093 if (tuple
->ipv4_proto
!= IPPROTO_ICMP
&&
1094 tuple
->ipv4_src
== entry
->key
.src
.addr
.ipv4_aligned
&&
1095 tuple
->ipv4_dst
== entry
->key
.dst
.addr
.ipv4_aligned
&&
1096 tuple
->ipv4_proto
== entry
->key
.nw_proto
&&
1097 tuple
->src_port
== entry
->key
.src
.port
&&
1098 tuple
->dst_port
== entry
->key
.dst
.port
&&
1099 (zone
? entry
->key
.zone
== zone
: TRUE
)) {
1100 OvsCtEntryDelete(entry
, TRUE
);
1101 } else if (tuple
->ipv4_src
== entry
->key
.src
.addr
.ipv4_aligned
&&
1102 tuple
->ipv4_dst
== entry
->key
.dst
.addr
.ipv4_aligned
&&
1103 tuple
->ipv4_proto
== entry
->key
.nw_proto
&&
1104 tuple
->src_port
== entry
->key
.src
.icmp_type
&&
1105 tuple
->dst_port
== entry
->key
.src
.icmp_code
&&
1106 (zone
? entry
->key
.zone
== zone
: TRUE
)) {
1107 OvsCtEntryDelete(entry
, TRUE
);
1109 } else if (!zone
|| zone
== entry
->key
.zone
) {
1110 OvsCtEntryDelete(entry
, TRUE
);
1116 NdisAcquireRWLockWrite(ovsCtNatLockObj
, &lockStateNat
, 0);
1118 NdisReleaseRWLock(ovsCtNatLockObj
, &lockStateNat
);
1119 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
1120 return NDIS_STATUS_SUCCESS
;
/*
 * OvsCtDeleteCmdHandler
 *     Netlink handler for the conntrack delete command.
 *
 *     The request attributes are parsed against ctAttrPolicy, where both
 *     CTA_TUPLE_ORIG (nested) and CTA_ZONE (big-endian 16 bit) are optional;
 *     a parse failure yields STATUS_INVALID_PARAMETER. The zone is converted
 *     with ntohs(). When a tuple attribute is present, a tuple buffer is
 *     allocated and filled by MapNlToCtTuple(). OvsCtFlush(zone, ct_tuple)
 *     then removes the matching entries.
 *
 *     On success a reply header with message type
 *     (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_DELETE) is written into the
 *     output buffer and *replyLen is taken from its nlmsgLen. The tuple
 *     buffer is freed afterwards. Any failure status is converted with
 *     NlMapStatusToNlErr() into a netlink error message built in the output
 *     buffer, after which the status is reset to STATUS_SUCCESS.
 *
 *     The jumps between these stages are not among the lines visible here.
 */
1124 OvsCtDeleteCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx
,
1127 POVS_MESSAGE msgIn
= (POVS_MESSAGE
)usrParamsCtx
->inputBuffer
;
1128 POVS_MESSAGE msgOut
= (POVS_MESSAGE
)usrParamsCtx
->outputBuffer
;
1129 PNL_MSG_HDR nlMsgHdr
= &(msgIn
->nlMsg
);
1130 PNL_ATTR ctAttrs
[__CTA_MAX
];
1131 UINT32 attrOffset
= NLMSG_HDRLEN
+ NF_GEN_MSG_HDRLEN
+ OVS_HDRLEN
;
1132 NL_ERROR nlError
= NL_ERROR_SUCCESS
;
1135 struct ovs_key_ct_tuple_ipv4
*ct_tuple
= NULL
;
1140 static const NL_POLICY ctAttrPolicy
[] = {
1141 [CTA_TUPLE_ORIG
] = {.type
= NL_A_NESTED
, .optional
= TRUE
},
1142 [CTA_ZONE
] = {.type
= NL_A_BE16
, .optional
= TRUE
},
1145 if ((NlAttrParse(nlMsgHdr
, attrOffset
, NlNfMsgAttrsLen(nlMsgHdr
),
1146 ctAttrPolicy
, ARRAY_SIZE(ctAttrPolicy
),
1147 ctAttrs
, ARRAY_SIZE(ctAttrs
)))
1149 OVS_LOG_ERROR("Ct attr parsing failed for msg: %p", nlMsgHdr
);
1150 status
= STATUS_INVALID_PARAMETER
;
1154 if (ctAttrs
[CTA_ZONE
]) {
1155 zone
= ntohs(NlAttrGetU16(ctAttrs
[CTA_ZONE
]));
1158 if (ctAttrs
[CTA_TUPLE_ORIG
]) {
1159 ct_tuple
= OvsAllocateMemoryWithTag(sizeof(struct ovs_key_ct_tuple_ipv4
),
1161 if (ct_tuple
== NULL
) {
1162 status
= STATUS_INSUFFICIENT_RESOURCES
;
1165 /* Parse ct tuple. */
1166 status
= MapNlToCtTuple(msgIn
, ctAttrs
[CTA_TUPLE_ORIG
], ct_tuple
);
1167 if (status
!= STATUS_SUCCESS
) {
1172 status
= OvsCtFlush(zone
, ct_tuple
);
1173 if (status
== STATUS_SUCCESS
) {
1174 nlmsgType
= (NFNL_SUBSYS_CTNETLINK
<< 8 | IPCTNL_MSG_CT_DELETE
);
1176 usrParamsCtx
->outputBuffer
,
1177 usrParamsCtx
->outputLength
);
1178 if (!NlFillOvsMsgForNfGenMsg(&nlBuf
, nlmsgType
, NLM_F_CREATE
,
1179 msgIn
->nlMsg
.nlmsgSeq
,
1180 msgIn
->nlMsg
.nlmsgPid
,
1182 msgIn
->nfGenMsg
.version
,
1184 status
= STATUS_INVALID_PARAMETER
;
1186 nlMsg
= (PNL_MSG_HDR
)NlBufAt(&nlBuf
, 0, 0);
1187 nlMsg
->nlmsgLen
= NlBufSize(&nlBuf
);
1188 *replyLen
= msgOut
->nlMsg
.nlmsgLen
;
1193 OvsFreeMemoryWithTag(ct_tuple
, OVS_CT_POOL_TAG
);
1196 nlError
= NlMapStatusToNlErr(status
);
1197 if (nlError
!= NL_ERROR_SUCCESS
) {
1198 POVS_MESSAGE_ERROR msgError
= (POVS_MESSAGE_ERROR
)
1199 usrParamsCtx
->outputBuffer
;
1202 NlBuildErrorMsg(msgIn
, msgError
, nlError
, replyLen
);
1203 ASSERT(*replyLen
!= 0);
1204 status
= STATUS_SUCCESS
;
/*
 * MapNlToCtTuple
 *     Decodes the nested CTA_TUPLE_* attribute 'ctAttr' of message 'msgIn'
 *     into '*ct_tuple'.
 *
 *     The outer level requires CTA_TUPLE_IP and CTA_TUPLE_PROTO (both nested,
 *     non-optional). Inside CTA_TUPLE_IP the IPv4 source and destination are
 *     optional and are stored only when both are present. Inside
 *     CTA_TUPLE_PROTO the protocol number is required; the port fields are
 *     filled either from the source/destination port pair or, failing that,
 *     from the ICMP type/code pair.
 *
 *     Any parse failure returns STATUS_INVALID_PARAMETER; otherwise
 *     NDIS_STATUS_SUCCESS is returned.
 *
 *     NOTE(review): fields whose attributes are absent are left untouched,
 *     so the result depends on how the caller initialized '*ct_tuple' --
 *     confirm the caller zeroes the buffer.
 */
1210 static __inline NDIS_STATUS
1211 MapNlToCtTuple(POVS_MESSAGE msgIn
, PNL_ATTR ctAttr
,
1212 struct ovs_key_ct_tuple_ipv4
*ct_tuple
) {
1214 PNL_MSG_HDR nlMsgHdr
= &(msgIn
->nlMsg
);
1215 PNL_ATTR ctTupleAttrs
[__CTA_MAX
];
1217 static const NL_POLICY ctTuplePolicy
[] = {
1218 [CTA_TUPLE_IP
] = {.type
= NL_A_NESTED
, .optional
= FALSE
},
1219 [CTA_TUPLE_PROTO
] = {.type
= NL_A_NESTED
, .optional
= FALSE
},
1222 static const NL_POLICY ctTupleIpPolicy
[] = {
1223 [CTA_IP_V4_SRC
] = { .type
= NL_A_BE32
, .optional
= TRUE
},
1224 [CTA_IP_V4_DST
] = { .type
= NL_A_BE32
, .optional
= TRUE
},
1227 static const NL_POLICY ctTupleProtoPolicy
[] = {
1228 [CTA_PROTO_NUM
] = { .type
= NL_A_U8
, .optional
= FALSE
},
1229 [CTA_PROTO_SRC_PORT
] = { .type
= NL_A_BE16
, .optional
= TRUE
},
1230 [CTA_PROTO_DST_PORT
] = { .type
= NL_A_BE16
, .optional
= TRUE
},
1231 [CTA_PROTO_ICMP_TYPE
] = { .type
= NL_A_U8
, .optional
= TRUE
},
1232 [CTA_PROTO_ICMP_CODE
] = { .type
= NL_A_U8
, .optional
= TRUE
},
1236 return STATUS_INVALID_PARAMETER
;
/* Nested parsing works on byte offsets from the start of the message. */
1239 attrOffset
= (UINT32
)((PCHAR
) ctAttr
- (PCHAR
)nlMsgHdr
);
1240 if ((NlAttrParseNested(nlMsgHdr
, attrOffset
, NlAttrLen(ctAttr
),
1241 ctTuplePolicy
, ARRAY_SIZE(ctTuplePolicy
),
1242 ctTupleAttrs
, ARRAY_SIZE(ctTupleAttrs
)))
1244 OVS_LOG_ERROR("CTA_TUPLE attr parsing failed for msg: %p", nlMsgHdr
);
1245 return STATUS_INVALID_PARAMETER
;
1248 if (ctTupleAttrs
[CTA_TUPLE_IP
]) {
1249 PNL_ATTR ctTupleIpAttrs
[__CTA_MAX
];
1250 attrOffset
= (UINT32
)((PCHAR
) ctTupleAttrs
[CTA_TUPLE_IP
] - (PCHAR
)nlMsgHdr
);
1251 if ((NlAttrParseNested(nlMsgHdr
, attrOffset
, NlAttrLen(ctTupleAttrs
[CTA_TUPLE_IP
]),
1252 ctTupleIpPolicy
, ARRAY_SIZE(ctTupleIpPolicy
),
1253 ctTupleIpAttrs
, ARRAY_SIZE(ctTupleIpAttrs
)))
1255 OVS_LOG_ERROR("CTA_TUPLE_IP attr parsing failed for msg: %p", nlMsgHdr
);
1256 return STATUS_INVALID_PARAMETER
;
1259 if (ctTupleIpAttrs
[CTA_IP_V4_SRC
] && ctTupleIpAttrs
[CTA_IP_V4_DST
]) {
1260 ct_tuple
->ipv4_src
= NlAttrGetU32(ctTupleIpAttrs
[CTA_IP_V4_SRC
]);
1261 ct_tuple
->ipv4_dst
= NlAttrGetU32(ctTupleIpAttrs
[CTA_IP_V4_DST
]);
1265 if (ctTupleAttrs
[CTA_TUPLE_PROTO
]) {
1266 PNL_ATTR ctTupleProtoAttrs
[__CTA_MAX
];
1267 attrOffset
= (UINT32
)((PCHAR
) ctTupleAttrs
[CTA_TUPLE_PROTO
] - (PCHAR
)nlMsgHdr
);
1268 if ((NlAttrParseNested(nlMsgHdr
, attrOffset
, NlAttrLen(ctTupleAttrs
[CTA_TUPLE_PROTO
]),
1269 ctTupleProtoPolicy
, ARRAY_SIZE(ctTupleProtoPolicy
),
1270 ctTupleProtoAttrs
, ARRAY_SIZE(ctTupleProtoAttrs
)))
1272 OVS_LOG_ERROR("CTA_TUPLE_PROTO attr parsing failed for msg: %p", nlMsgHdr
);
1273 return STATUS_INVALID_PARAMETER
;
1276 if (ctTupleProtoAttrs
[CTA_PROTO_NUM
]) {
1277 ct_tuple
->ipv4_proto
= NlAttrGetU8 (ctTupleProtoAttrs
[CTA_PROTO_NUM
]);
1278 if (ctTupleProtoAttrs
[CTA_PROTO_SRC_PORT
] && ctTupleProtoAttrs
[CTA_PROTO_DST_PORT
]) {
1279 ct_tuple
->src_port
= NlAttrGetU16(ctTupleProtoAttrs
[CTA_PROTO_SRC_PORT
]);
1280 ct_tuple
->dst_port
= NlAttrGetU16(ctTupleProtoAttrs
[CTA_PROTO_DST_PORT
]);
1281 } else if (ctTupleProtoAttrs
[CTA_PROTO_ICMP_TYPE
] &&
1282 ctTupleProtoAttrs
[CTA_PROTO_ICMP_CODE
] ) {
1283 ct_tuple
->src_port
= NlAttrGetU8(ctTupleProtoAttrs
[CTA_PROTO_ICMP_TYPE
]);
1284 ct_tuple
->dst_port
= NlAttrGetU8(ctTupleProtoAttrs
[CTA_PROTO_ICMP_CODE
]);
1290 return NDIS_STATUS_SUCCESS
;
/*
 * MapIpTupleToNl --
 *    Appends a CTA_TUPLE_IP nested attribute to nlBuf that carries the source
 *    and destination addresses of key.
 *
 *    - dl_type equal to ntohs(ETH_TYPE_IPV4): CTA_IP_V4_SRC and CTA_IP_V4_DST
 *      are written as 32-bit values taken from key->src.addr.ipv4 and
 *      key->dst.addr.ipv4 as they are.
 *    - dl_type equal to ntohs(ETH_TYPE_IPV6): CTA_IP_V6_SRC and CTA_IP_V6_DST
 *      are written as opaque blobs of sizeof(ipv6) bytes.
 *    - any other dl_type: no address attribute is added by the visible code.
 *
 *    status starts as NDIS_STATUS_SUCCESS and is set to NDIS_STATUS_FAILURE
 *    when one of the put helpers reports failure. A failure to open the
 *    nested attribute returns NDIS_STATUS_FAILURE directly.
 *
 *    NOTE(review): the statements that follow each failure assignment, the
 *    declaration of offset and the final return are not visible in this
 *    extract -- check the complete file before relying on the exact flow.
 */
1293 static __inline NDIS_STATUS
1294 MapIpTupleToNl(PNL_BUFFER nlBuf
, OVS_CT_KEY
*key
)
1296 NDIS_STATUS status
= NDIS_STATUS_SUCCESS
;
/* offset identifies the opened nested attribute; it is handed back to NlMsgEndNested. */
1299 offset
= NlMsgStartNested(nlBuf
, CTA_TUPLE_IP
);
1301 return NDIS_STATUS_FAILURE
;
1304 if (key
->dl_type
== ntohs(ETH_TYPE_IPV4
)) {
1305 if (!NlMsgPutTailU32(nlBuf
, CTA_IP_V4_SRC
, key
->src
.addr
.ipv4
)) {
1306 status
= NDIS_STATUS_FAILURE
;
1309 if (!NlMsgPutTailU32(nlBuf
, CTA_IP_V4_DST
, key
->dst
.addr
.ipv4
)) {
1310 status
= NDIS_STATUS_FAILURE
;
1313 } else if (key
->dl_type
== ntohs(ETH_TYPE_IPV6
)) {
/* IPv6 addresses are copied as raw bytes. */
1314 if (!NlMsgPutTailUnspec(nlBuf
, CTA_IP_V6_SRC
,
1315 (PCHAR
)(&key
->src
.addr
.ipv6
),
1316 sizeof(key
->src
.addr
.ipv6
))) {
1317 status
= NDIS_STATUS_FAILURE
;
1320 if (!NlMsgPutTailUnspec(nlBuf
, CTA_IP_V6_DST
,
1321 (PCHAR
)(&key
->dst
.addr
.ipv6
),
1322 sizeof(key
->dst
.addr
.ipv6
))) {
1323 status
= NDIS_STATUS_FAILURE
;
/* Close the nested attribute opened above. */
1329 NlMsgEndNested(nlBuf
, offset
);
/*
 * MapProtoTupleToNl --
 *    Appends a CTA_TUPLE_PROTO nested attribute to nlBuf that describes the
 *    transport part of key.
 *
 *    CTA_PROTO_NUM is always written from key->nw_proto. When dl_type is IPv4
 *    or IPv6, protocol specific attributes follow:
 *      - IPPROTO_ICMP: CTA_PROTO_ICMP_ID (htons of key->src.icmp_id),
 *        CTA_PROTO_ICMP_TYPE and CTA_PROTO_ICMP_CODE from key->src.
 *      - IPPROTO_ICMPV6: the ICMPv6 id, type and code attributes, all
 *        written as 0.
 *      - IPPROTO_TCP or IPPROTO_UDP: CTA_PROTO_SRC_PORT and
 *        CTA_PROTO_DST_PORT.
 *
 *    status starts as NDIS_STATUS_SUCCESS and is set to NDIS_STATUS_FAILURE
 *    when a put helper reports failure.
 *
 *    NOTE(review): the value arguments of the two port puts, the statements
 *    that follow each failure assignment and the final return are not visible
 *    in this extract -- check the complete file.
 */
1333 static __inline NDIS_STATUS
1334 MapProtoTupleToNl(PNL_BUFFER nlBuf
, OVS_CT_KEY
*key
)
1336 NDIS_STATUS status
= NDIS_STATUS_SUCCESS
;
1339 offset
= NlMsgStartNested(nlBuf
, CTA_TUPLE_PROTO
);
1341 return NDIS_STATUS_FAILURE
;
1344 if (!NlMsgPutTailU8(nlBuf
, CTA_PROTO_NUM
, key
->nw_proto
)) {
1345 status
= NDIS_STATUS_FAILURE
;
1349 if (key
->dl_type
== ntohs(ETH_TYPE_IPV4
)
1350 || key
->dl_type
== ntohs(ETH_TYPE_IPV6
)) {
1351 /* ICMP id, type and code are taken from key->src; the ICMPv6 ones are written as 0 */
1352 if (key
->nw_proto
== IPPROTO_ICMP
) {
1353 if (!NlMsgPutTailU16(nlBuf
, CTA_PROTO_ICMP_ID
,
1354 htons(key
->src
.icmp_id
))) {
1355 status
= NDIS_STATUS_FAILURE
;
1358 if (!NlMsgPutTailU8(nlBuf
, CTA_PROTO_ICMP_TYPE
,
1359 key
->src
.icmp_type
)) {
1360 status
= NDIS_STATUS_FAILURE
;
1363 if (!NlMsgPutTailU8(nlBuf
, CTA_PROTO_ICMP_CODE
,
1364 key
->src
.icmp_code
)) {
1365 status
= NDIS_STATUS_FAILURE
;
1368 } else if (key
->nw_proto
== IPPROTO_ICMPV6
) {
/* Placeholders only: every ICMPv6 field is emitted with the constant 0. */
1369 if (!NlMsgPutTailU16(nlBuf
, CTA_PROTO_ICMPV6_ID
, 0)) {
1370 status
= NDIS_STATUS_FAILURE
;
1373 if (!NlMsgPutTailU8(nlBuf
, CTA_PROTO_ICMPV6_TYPE
, 0)) {
1374 status
= NDIS_STATUS_FAILURE
;
1377 if (!NlMsgPutTailU8(nlBuf
, CTA_PROTO_ICMPV6_CODE
, 0)) {
1378 status
= NDIS_STATUS_FAILURE
;
1381 } else if (key
->nw_proto
== IPPROTO_TCP
1382 || key
->nw_proto
== IPPROTO_UDP
) {
/* NOTE(review): the port value passed to each put below is not visible in this extract. */
1383 if (!NlMsgPutTailU16(nlBuf
, CTA_PROTO_SRC_PORT
,
1385 status
= NDIS_STATUS_FAILURE
;
1388 if (!NlMsgPutTailU16(nlBuf
, CTA_PROTO_DST_PORT
,
1390 status
= NDIS_STATUS_FAILURE
;
/* Close the nested attribute opened above. */
1397 NlMsgEndNested(nlBuf
, offset
);
/*
 * MapCtKeyTupleToNl --
 *    Appends one nested tuple attribute of type tupleType to nlBuf. The
 *    nested attribute is filled by MapIpTupleToNl followed by
 *    MapProtoTupleToNl, both called with key, and is then closed with
 *    NlMsgEndNested.
 *
 *    A failure to open the nested attribute returns NDIS_STATUS_FAILURE.
 *    The status of each helper is checked against NDIS_STATUS_SUCCESS.
 *
 *    NOTE(review): the parameters after nlBuf, the bodies of the two status
 *    checks and the final return are not visible in this extract.
 */
1401 static __inline NDIS_STATUS
1402 MapCtKeyTupleToNl(PNL_BUFFER nlBuf
,
1406 NDIS_STATUS status
= NDIS_STATUS_SUCCESS
;
1409 offset
= NlMsgStartNested(nlBuf
, tupleType
);
1411 return NDIS_STATUS_FAILURE
;
/* Address part first, then the protocol part. */
1414 status
= MapIpTupleToNl(nlBuf
, key
);
1415 if (status
!= NDIS_STATUS_SUCCESS
) {
1419 status
= MapProtoTupleToNl(nlBuf
, key
);
1420 if (status
!= NDIS_STATUS_SUCCESS
) {
1425 NlMsgEndNested(nlBuf
, offset
);
/*
 * MapCtCounterToNl --
 *    Appends one nested counter attribute of type counterType to nlBuf. It
 *    holds CTA_COUNTERS_PACKETS and CTA_COUNTERS_BYTES, written as 64-bit
 *    values obtained by passing key->packetCount and key->byteCount through
 *    htonll.
 *
 *    A failure to open the nested attribute returns NDIS_STATUS_FAILURE.
 *    status is set to NDIS_STATUS_FAILURE when a put helper reports failure.
 *
 *    NOTE(review): the parameters after nlBuf, the statements that follow
 *    each failure assignment and the final return are not visible in this
 *    extract.
 */
1429 static __inline NDIS_STATUS
1430 MapCtCounterToNl(PNL_BUFFER nlBuf
,
1434 NDIS_STATUS status
= NDIS_STATUS_SUCCESS
;
1437 offset
= NlMsgStartNested(nlBuf
, counterType
);
1439 return NDIS_STATUS_FAILURE
;
1442 if (!NlMsgPutTailU64(nlBuf
, CTA_COUNTERS_PACKETS
,
1443 htonll(key
->packetCount
))) {
1444 status
= NDIS_STATUS_FAILURE
;
1448 if (!NlMsgPutTailU64(nlBuf
, CTA_COUNTERS_BYTES
,
1449 htonll(key
->byteCount
))) {
1450 status
= NDIS_STATUS_FAILURE
;
/* Close the nested attribute opened above. */
1455 NlMsgEndNested(nlBuf
, offset
);
1459 /* Userspace expects system time to be Unix timestamp in Nano Seconds */
1460 static __inline
unsigned
1461 WindowsTickToUnixSeconds(long long windowsTicks
)
1464 * Windows epoch starts 1601-01-01T00:00:00Z. It's 11644473600 seconds
1465 * before the UNIX/Linux epoch (1970-01-01T00:00:00Z). Windows ticks are
1466 * in 100 nanoseconds
1468 return (unsigned)((windowsTicks
/ WINDOWS_TICK
1469 - SEC_TO_UNIX_EPOCH
));
/*
 * OvsCreateNlMsgFromCtEntry --
 *    Serializes the conntrack entry into a netlink message written to
 *    outBuffer (capacity outBufLen), mimicking the netfilter conntrack
 *    netlink format.
 *
 *    Header:
 *      - message type is NFNL_SUBSYS_CTNETLINK << 8 combined with
 *        IPCTNL_MSG_CT_NEW for OVS_EVENT_CT_NEW and OVS_EVENT_CT_UPDATE, or
 *        with IPCTNL_MSG_CT_DELETE for OVS_EVENT_CT_DELETE.
 *      - nfgenFamily is AF_INET or AF_INET6 depending on entry->key.dl_type
 *        and stays 0 otherwise.
 *      - nlmsgFlags starts as NLM_F_CREATE.
 *
 *    Attributes, in the order they are appended:
 *      CTA_TUPLE_ORIG (entry->key), CTA_TUPLE_REPLY (entry->rev_key),
 *      CTA_COUNTERS_ORIG, CTA_COUNTERS_REPLY, CTA_ZONE (only when the zone is
 *      non-zero), CTA_MARK, CTA_LABELS, CTA_TIMEOUT (only when
 *      entry->expiration is later than the current system time; the
 *      remaining time is divided by CT_INTERVAL_SEC), CTA_PROTOINFO (TCP
 *      only), CTA_STATUS (always 0), CTA_ID and, when entry->timestampStart
 *      is non-zero, CTA_TIMESTAMP holding CTA_TIMESTAMP_START. The start time
 *      is WindowsTickToUnixSeconds(timestampStart) multiplied by
 *      SEC_TO_NANOSEC.
 *    Finally nlmsgLen of the message header is set to NlBufSize of the
 *    buffer and STATUS_SUCCESS is returned.
 *
 *    Failure returns visible here: STATUS_INVALID_PARAMETER,
 *    STATUS_INVALID_BUFFER_SIZE, STATUS_UNSUCCESSFUL and, on two paths that
 *    follow NlMsgStartNested, NDIS_STATUS_FAILURE.
 *
 *    NOTE(review): the two NDIS_STATUS_FAILURE returns mix status families
 *    with the STATUS_* codes used everywhere else in this function --
 *    consider unifying them.
 *    NOTE(review): entry->labels.ct_labels is used as a condition; if
 *    ct_labels is an array member the test is always true -- confirm against
 *    the structure definition.
 *    NOTE(review): CTA_ID is built from the entry pointer cast to UINT32,
 *    which drops the upper bits on 64-bit builds; the file disables compiler
 *    warning 4311 near its top.
 *    NOTE(review): the return type, most parameters, several locals and some
 *    conditions of this function are not visible in this extract.
 */
1473 OvsCreateNlMsgFromCtEntry(POVS_CT_ENTRY entry
,
1487 UINT64 currentTime
, expiration
;
1489 UINT16 nlmsgFlags
= NLM_F_CREATE
;
/*
 * NOTE(review): the argument on the next lines looks mis-encoded in this
 * copy of the file; presumably it is the address of currentTime -- confirm
 * against the upstream source and repair the text.
 */
1490 NdisGetCurrentSystemTime((LARGE_INTEGER
*)¤tTime
);
1491 UINT8 nfgenFamily
= 0;
/* Address family reported in the nfgen header follows the ethertype of the key. */
1493 if (entry
->key
.dl_type
== htons(ETH_TYPE_IPV4
)) {
1494 nfgenFamily
= AF_INET
;
1495 } else if (entry
->key
.dl_type
== htons(ETH_TYPE_IPV6
)) {
1496 nfgenFamily
= AF_INET6
;
1499 NlBufInit(&nlBuf
, outBuffer
, outBufLen
);
1500 /* Mimic netfilter */
1501 if (eventType
== OVS_EVENT_CT_NEW
|| eventType
== OVS_EVENT_CT_UPDATE
) {
1502 nlmsgType
= (UINT16
) (NFNL_SUBSYS_CTNETLINK
<< 8 | IPCTNL_MSG_CT_NEW
);
1503 } else if (eventType
== OVS_EVENT_CT_DELETE
) {
1504 nlmsgType
= (UINT16
) (NFNL_SUBSYS_CTNETLINK
<< 8 | IPCTNL_MSG_CT_DELETE
);
1506 return STATUS_INVALID_PARAMETER
;
1509 if (eventType
== OVS_EVENT_CT_UPDATE
) {
1510 /* In netlink-conntrack.c IPCTNL_MSG_CT_NEW msg type is used to
1511 * differentiate between OVS_EVENT_CT_NEW and OVS_EVENT_CT_UPDATE
1512 * events based on nlmsgFlags, unset it to notify an update event.
1516 ok
= NlFillOvsMsgForNfGenMsg(&nlBuf
, nlmsgType
, nlmsgFlags
,
1517 nlmsgSeq
, nlmsgPid
, nfgenFamily
,
1518 nfGenVersion
, dpIfIndex
);
1520 return STATUS_INVALID_BUFFER_SIZE
;
1523 status
= MapCtKeyTupleToNl(&nlBuf
, CTA_TUPLE_ORIG
, &entry
->key
);
1524 if (status
!= NDIS_STATUS_SUCCESS
) {
1525 return STATUS_UNSUCCESSFUL
;
1528 status
= MapCtKeyTupleToNl(&nlBuf
, CTA_TUPLE_REPLY
, &entry
->rev_key
);
1529 if (status
!= NDIS_STATUS_SUCCESS
) {
1530 return STATUS_UNSUCCESSFUL
;
1533 status
= MapCtCounterToNl(&nlBuf
, CTA_COUNTERS_ORIG
, &entry
->key
);
1534 if (status
!= NDIS_STATUS_SUCCESS
) {
1535 return STATUS_UNSUCCESSFUL
;
1538 status
= MapCtCounterToNl(&nlBuf
, CTA_COUNTERS_REPLY
, &entry
->rev_key
);
1539 if (status
!= NDIS_STATUS_SUCCESS
) {
1540 return STATUS_UNSUCCESSFUL
;
1543 if (entry
->key
.zone
) {
1544 if (!NlMsgPutTailU16(&nlBuf
, CTA_ZONE
, htons(entry
->key
.zone
))) {
1545 return STATUS_INVALID_BUFFER_SIZE
;
1550 if (!NlMsgPutTailU32(&nlBuf
, CTA_MARK
, htonl(entry
->mark
))) {
1551 return STATUS_INVALID_BUFFER_SIZE
;
1555 if (entry
->labels
.ct_labels
) {
1556 ok
= NlMsgPutTailUnspec(&nlBuf
, CTA_LABELS
,
1557 (PCHAR
)(&entry
->labels
),
1558 sizeof(entry
->labels
));
1560 return STATUS_INVALID_BUFFER_SIZE
;
1564 if (entry
->expiration
> currentTime
) {
1565 expiration
= entry
->expiration
- currentTime
;
1566 timeout
= (UINT32
) (expiration
/ CT_INTERVAL_SEC
);
1567 if (!NlMsgPutTailU32(&nlBuf
, CTA_TIMEOUT
, htonl(timeout
))) {
1568 return STATUS_INVALID_BUFFER_SIZE
;
1572 if (entry
->key
.nw_proto
== IPPROTO_TCP
) {
1573 /* Add ProtoInfo for TCP */
1575 offset
= NlMsgStartNested(&nlBuf
, CTA_PROTOINFO
);
1577 return NDIS_STATUS_FAILURE
;
1580 status
= OvsCtMapTcpProtoInfoToNl(&nlBuf
, entry
);
1581 NlMsgEndNested(&nlBuf
, offset
);
1582 if (status
!= NDIS_STATUS_SUCCESS
) {
1583 return STATUS_UNSUCCESSFUL
;
1587 /* CTA_STATUS is required but not implemented. Default to 0 */
1588 if (!NlMsgPutTailU32(&nlBuf
, CTA_STATUS
, 0)) {
1589 return STATUS_INVALID_BUFFER_SIZE
;
1592 /* Mimic netfilter - nf_conntrack_netlink.c:
1594 * int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) {
1595 * NLA_PUT_BE32(skb, CTA_ID, htonl((unsigned long)ct));
1600 if(!NlMsgPutTailU32(&nlBuf
, CTA_ID
, htonl((UINT32
) entry
))) {
1601 return STATUS_INVALID_BUFFER_SIZE
;
1604 if (entry
->timestampStart
) {
1606 offset
= NlMsgStartNested(&nlBuf
, CTA_TIMESTAMP
);
1608 return NDIS_STATUS_FAILURE
;
1611 start
= WindowsTickToUnixSeconds(entry
->timestampStart
);
1612 start
= start
* SEC_TO_NANOSEC
;
1613 if (!NlMsgPutTailU64(&nlBuf
, CTA_TIMESTAMP_START
, htonll(start
))) {
1614 NlMsgEndNested(&nlBuf
, offset
);
1615 return STATUS_INVALID_BUFFER_SIZE
;
1618 NlMsgEndNested(&nlBuf
, offset
);
1621 nlMsg
= (PNL_MSG_HDR
)NlBufAt(&nlBuf
, 0, 0);
1622 nlMsg
->nlmsgLen
= NlBufSize(&nlBuf
);
1624 return STATUS_SUCCESS
;
1628 *----------------------------------------------------------------------------
1629 * OvsCtDumpCmdHandler --
1630 * Handler for IPCTNL_MSG_CT_GET command.
1632 * XXX - Try to consolidate dump handler patterns around dumpState usage
1633 * The following dumpHandler is similar to one vport.c uses
1634 *----------------------------------------------------------------------------
/*
 * Flow of OvsCtDumpCmdHandler as visible in this file:
 *
 *  - devOp == OVS_WRITE_DEV_OP: the dump is set up with OvsSetupDumpStart
 *    and STATUS_SUCCESS is returned.
 *  - otherwise the operation is asserted to be OVS_READ_DEV_OP. When the
 *    dump state of the instance has no stored message,
 *    STATUS_INVALID_DEVICE_STATE is returned.
 *  - the resume position is read from dumpState.index: slot 0 is the hash
 *    bucket and slot 1 the position inside that bucket.
 *  - while holding the read side of ovsConntrackLockObj, and only when
 *    ctTotalEntries is non-zero, buckets are walked starting at the stored
 *    bucket. For an entry whose position is at or past the stored position a
 *    netlink message is built into the output buffer with
 *    OvsCreateNlMsgFromCtEntry. A failure of that call releases the lock and
 *    returns STATUS_UNSUCCESSFUL.
 *  - after the walk the current bucket and position are stored back into
 *    dumpState.index and the lock is released.
 *  - when the bucket counter is still below CT_HASH_TABLE_SIZE an entry was
 *    produced and *replyLen is set to the nlmsgLen found in the output
 *    buffer. Otherwise the dump is finished and FreeUserDumpState is called.
 *    The function then returns STATUS_SUCCESS.
 *
 * NOTE(review): the result of OvsCreateNlMsgFromCtEntry is compared with
 * NDIS_STATUS_SUCCESS although the visible return statements of that
 * function use STATUS_* codes -- confirm both success values are equal.
 * NOTE(review): several lines of this function (remaining parameters, some
 * call arguments, the statements that end the bucket walk once an entry has
 * been written, and branch delimiters) are not visible in this extract.
 */
1637 OvsCtDumpCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx
,
1641 /* Setup Dump Start if it's OVS_WRITE_DEV_OP and return */
1642 if (usrParamsCtx
->devOp
== OVS_WRITE_DEV_OP
) {
1644 OvsSetupDumpStart(usrParamsCtx
);
1645 return STATUS_SUCCESS
;
1648 POVS_OPEN_INSTANCE instance
=
1649 (POVS_OPEN_INSTANCE
)usrParamsCtx
->ovsInstance
;
1652 ASSERT(usrParamsCtx
->devOp
== OVS_READ_DEV_OP
);
1653 if (instance
->dumpState
.ovsMsg
== NULL
) {
1655 return STATUS_INVALID_DEVICE_STATE
;
1658 /* Output buffer has been validated while validating read dev op. */
1659 ASSERT(usrParamsCtx
->outputBuffer
!= NULL
);
1660 msgIn
= instance
->dumpState
.ovsMsg
;
/* Resume position saved by the previous read: bucket, then position inside the bucket. */
1661 UINT32 inBucket
= instance
->dumpState
.index
[0];
1662 UINT32 inIndex
= instance
->dumpState
.index
[1];
/* i starts at CT_HASH_TABLE_SIZE so that an empty table is reported as dump done. */
1663 UINT32 i
= CT_HASH_TABLE_SIZE
;
1664 UINT32 outIndex
= 0;
1666 LOCK_STATE_EX lockState
;
1667 NdisAcquireRWLockRead(ovsConntrackLockObj
, &lockState
, 0);
1669 if (ctTotalEntries
) {
1670 for (i
= inBucket
; i
< CT_HASH_TABLE_SIZE
; i
++) {
1671 PLIST_ENTRY head
, link
;
1672 head
= &ovsConntrackTable
[i
];
1673 POVS_CT_ENTRY entry
= NULL
;
1676 LIST_FORALL(head
, link
) {
1678 * if one or more dumps were previously done on this same
1679 * bucket, inIndex will be > 0, so we'll need to reply with
1680 * the inIndex + 1 ct-entry from the bucket.
1682 if (outIndex
>= inIndex
) {
1683 entry
= CONTAINING_RECORD(link
, OVS_CT_ENTRY
, link
);
1685 rc
= OvsCreateNlMsgFromCtEntry(entry
,
1686 usrParamsCtx
->outputBuffer
,
1687 usrParamsCtx
->outputLength
,
1689 msgIn
->nlMsg
.nlmsgSeq
,
1690 msgIn
->nlMsg
.nlmsgPid
,
1691 msgIn
->nfGenMsg
.version
,
/* The read lock is dropped before the error is returned. */
1694 if (rc
!= NDIS_STATUS_SUCCESS
) {
1695 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
1696 return STATUS_UNSUCCESSFUL
;
1711 * if no ct-entry was found above, check the next bucket, beginning
1712 * with the first (i.e. index 0) elem from within that bucket
/* Remember where the next read has to continue. */
1717 instance
->dumpState
.index
[0] = i
;
1718 instance
->dumpState
.index
[1] = outIndex
;
1719 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
1721 /* if i < CT_HASH_TABLE_SIZE => entry was found */
1722 if (i
< CT_HASH_TABLE_SIZE
) {
1723 POVS_MESSAGE msgOut
= (POVS_MESSAGE
)usrParamsCtx
->outputBuffer
;
1724 *replyLen
= msgOut
->nlMsg
.nlmsgLen
;
1726 /* if i >= CT_HASH_TABLE_SIZE => entry was not found => dump done */
1728 FreeUserDumpState(instance
);
1731 return STATUS_SUCCESS
;
1734 #pragma warning(pop)