2 * Copyright (c) 2015, 2016 VMware, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
20 #define OVS_DBG_MOD OVS_DBG_CONTRK
22 #include "Conntrack.h"
24 #include "PacketParser.h"
/*
 * Context for the conntrack cleaner thread.  The member declarations are
 * not visible in this listing; elsewhere in the file the single instance
 * (ctThreadCtx) is accessed through .event, .threadObject and .exit.
 */
27 typedef struct _OVS_CT_THREAD_CTX
{
31 } OVS_CT_THREAD_CTX
, *POVS_CT_THREAD_CTX
;
/* Entry point of the cleaner thread; handed to PsCreateSystemThread below. */
33 KSTART_ROUTINE ovsConntrackEntryCleaner
;
/* Array of list heads; a bucket is selected with (hash & CT_HASH_TABLE_MASK). */
34 static PLIST_ENTRY ovsConntrackTable
;
/* State shared with the cleaner thread (.event, .threadObject, .exit). */
35 static OVS_CT_THREAD_CTX ctThreadCtx
;
/*
 * RW lock; acquired in write mode by the packet path (OvsCtExecute_), by the
 * cleaner thread and by OvsCleanupConntrack.
 */
36 static PNDIS_RW_LOCK_EX ovsConntrackLockObj
;
39 *----------------------------------------------------------------------------
41 * Initialize the components used by Connection Tracking
42 *----------------------------------------------------------------------------
/*
 * OvsInitConntrack
 *     Sets up the global conntrack state in three steps: the RW lock
 *     (allocated against context->NdisFilterHandle), the bucket array, and
 *     the cleaner thread.
 *
 *     Returns STATUS_INSUFFICIENT_RESOURCES when the lock or the bucket
 *     array cannot be allocated, and STATUS_SUCCESS at the end of the
 *     function.  When thread creation fails, the lock and the table are
 *     released again (the value returned on that path is not visible in
 *     this listing).
 */
45 OvsInitConntrack(POVS_SWITCH_CONTEXT context
)
48 HANDLE threadHandle
= NULL
;
50 /* Init the sync-lock */
51 ovsConntrackLockObj
= NdisAllocateRWLock(context
->NdisFilterHandle
);
52 if (ovsConntrackLockObj
== NULL
) {
53 return STATUS_INSUFFICIENT_RESOURCES
;
56 /* Init the Hash Buffer */
57 ovsConntrackTable
= OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY
)
/* Table allocation failed: give back the lock allocated above. */
60 if (ovsConntrackTable
== NULL
) {
61 NdisFreeRWLock(ovsConntrackLockObj
);
62 ovsConntrackLockObj
= NULL
;
63 return STATUS_INSUFFICIENT_RESOURCES
;
/* Every bucket starts out as an empty list. */
66 for (int i
= 0; i
< CT_HASH_TABLE_SIZE
; i
++) {
67 InitializeListHead(&ovsConntrackTable
[i
]);
70 /* Init CT Cleaner Thread */
71 KeInitializeEvent(&ctThreadCtx
.event
, NotificationEvent
, FALSE
);
72 status
= PsCreateSystemThread(&threadHandle
, SYNCHRONIZE
, NULL
, NULL
,
73 NULL
, ovsConntrackEntryCleaner
,
/* Thread creation failed: undo both earlier allocations. */
76 if (status
!= STATUS_SUCCESS
) {
77 NdisFreeRWLock(ovsConntrackLockObj
);
78 ovsConntrackLockObj
= NULL
;
80 OvsFreeMemoryWithTag(ovsConntrackTable
, OVS_CT_POOL_TAG
);
81 ovsConntrackTable
= NULL
;
/*
 * Keep a reference to the thread object (OvsCleanupConntrack waits on it and
 * drops the reference), then close the creation handle.
 * NOTE(review): the result of ObReferenceObjectByHandle is not checked here.
 */
86 ObReferenceObjectByHandle(threadHandle
, SYNCHRONIZE
, NULL
, KernelMode
,
87 &ctThreadCtx
.threadObject
, NULL
);
88 ZwClose(threadHandle
);
90 return STATUS_SUCCESS
;
94 *----------------------------------------------------------------------------
96 * Cleanup memory and thread that were spawned for Connection tracking
97 *----------------------------------------------------------------------------
/*
 * OvsCleanupConntrack
 *     Stops the cleaner thread and releases the global conntrack state.
 *     Order: set the exit flag and signal the thread's event while holding
 *     the write lock, wait for the thread object, drop the reference taken
 *     in OvsInitConntrack, free the bucket array, then free the lock.
 *
 *     NOTE(review): only the bucket array itself is freed in the visible
 *     code; confirm that entries still linked in the buckets are released
 *     elsewhere.
 */
100 OvsCleanupConntrack(VOID
)
102 LOCK_STATE_EX lockState
;
103 NdisAcquireRWLockWrite(ovsConntrackLockObj
, &lockState
, 0);
/* Ask the cleaner thread to exit and wake it if it is waiting. */
104 ctThreadCtx
.exit
= 1;
105 KeSetEvent(&ctThreadCtx
.event
, 0, FALSE
);
106 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
/* The lock is released before waiting on the thread object. */
108 KeWaitForSingleObject(ctThreadCtx
.threadObject
, Executive
,
109 KernelMode
, FALSE
, NULL
);
110 ObDereferenceObject(ctThreadCtx
.threadObject
);
112 if (ovsConntrackTable
) {
113 OvsFreeMemoryWithTag(ovsConntrackTable
, OVS_CT_POOL_TAG
);
114 ovsConntrackTable
= NULL
;
117 NdisFreeRWLock(ovsConntrackLockObj
);
118 ovsConntrackLockObj
= NULL
;
/*
 * OvsCtKeyReverse
 *     Operates in place on the key it is given; OvsCtAddEntry calls it on a
 *     copy of the forward key to build entry->rev_key.  Only the declaration
 *     of a temporary ct_endpoint is visible in this listing; presumably the
 *     body swaps key->src and key->dst through it -- TODO confirm.
 */
122 OvsCtKeyReverse(OVS_CT_KEY
*key
)
124 struct ct_endpoint tmp
;
/*
 * OvsCtUpdateFlowKey
 *     Writes conntrack metadata into the flow key.  Callers in this file
 *     pass (key, state, zone, mark, labels); only the first and last
 *     parameter declarations are visible in this listing.
 *
 *     Visible effects:
 *       - key->ct.state is set to state with OVS_CS_F_TRACKED added.
 *       - key->ct.labels either receives a copy of *labels or is zeroed.
 *         The test choosing between the two is not visible; callers pass
 *         NULL when there are no labels, so presumably it is a NULL check
 *         -- TODO confirm.
 */
131 OvsCtUpdateFlowKey(struct OvsFlowKey
*key
,
135 struct ovs_key_ct_labels
*labels
)
137 key
->ct
.state
= state
| OVS_CS_F_TRACKED
;
141 NdisMoveMemory(&key
->ct
.labels
, labels
,
142 sizeof(struct ovs_key_ct_labels
));
144 memset(&key
->ct
.labels
, 0,
145 sizeof(struct ovs_key_ct_labels
));
/*
 * OvsCtAddEntry
 *     Fills in both keys of a new entry from the lookup context and links
 *     the entry at the head of its bucket.  entry->key is a copy of
 *     ctx->key; entry->rev_key is the same copy passed through
 *     OvsCtKeyReverse.
 *
 *     NOTE(review): entry is written through without a NULL check in the
 *     visible code, and OvsCtEntryCreate passes the result of the
 *     OvsConntrackCreate*Entry helpers straight in; confirm those cannot
 *     return NULL.
 */
150 OvsCtAddEntry(POVS_CT_ENTRY entry
, OvsConntrackKeyLookupCtx
*ctx
)
152 NdisMoveMemory(&entry
->key
, &ctx
->key
, sizeof (OVS_CT_KEY
));
153 NdisMoveMemory(&entry
->rev_key
, &ctx
->key
, sizeof (OVS_CT_KEY
));
154 OvsCtKeyReverse(&entry
->rev_key
);
/* Bucket index is the low bits of the hash computed for the lookup. */
155 InsertHeadList(&ovsConntrackTable
[ctx
->hash
& CT_HASH_TABLE_MASK
],
/*
 * OvsCtEntryCreate
 *     Creates a conntrack entry for a packet that has no usable entry and
 *     records the resulting ct_state in the flow key.  Callers in this file
 *     pass (curNbl, ip protocol, l4Offset, ctx, key, commit, currentTime).
 *
 *     Three paths are visible; the code that selects between them (and any
 *     use of commit) is not visible in this listing:
 *       - TCP:   header fetched and validated, state gets OVS_CS_F_NEW, a
 *                TCP entry is created and added to the table.
 *       - other: state gets OVS_CS_F_NEW, a generic entry is created and
 *                added to the table.
 *       - else:  state gets OVS_CS_F_INVALID and no entry is created.
 *     Every visible path ends by updating the flow key with the state,
 *     ctx->key.zone, mark 0 and no labels.
 */
159 static __inline POVS_CT_ENTRY
160 OvsCtEntryCreate(PNET_BUFFER_LIST curNbl
,
163 OvsConntrackKeyLookupCtx
*ctx
,
168 POVS_CT_ENTRY entry
= NULL
;
/* TCP path: the branch taken when validation fails is not visible here. */
176 tcp
= OvsGetTcp(curNbl
, l4Offset
, &tcpStorage
);
177 if (!OvsConntrackValidateTcpPacket(tcp
)) {
181 state
|= OVS_CS_F_NEW
;
183 entry
= OvsConntrackCreateTcpEntry(tcp
, curNbl
, currentTime
);
184 OvsCtAddEntry(entry
, ctx
);
187 OvsCtUpdateFlowKey(key
, state
, ctx
->key
.zone
, 0, NULL
);
/* Non-TCP path: the generic entry is created from the timestamp alone. */
192 state
|= OVS_CS_F_NEW
;
194 entry
= OvsConntrackCreateOtherEntry(currentTime
);
195 OvsCtAddEntry(entry
, ctx
);
198 OvsCtUpdateFlowKey(key
, state
, ctx
->key
.zone
, 0, NULL
);
/* Fallback path: mark the packet invalid; entry keeps its initial NULL. */
205 state
|= OVS_CS_F_INVALID
;
206 OvsCtUpdateFlowKey(key
, state
, ctx
->key
.zone
, 0, NULL
);
/*
 * OvsCtUpdateEntry
 *     Feeds a packet to the protocol-specific state update for an existing
 *     entry and returns that helper's CT_UPDATE_RES.  The caller in this
 *     file passes (entry, nbl, ip protocol, l4Offset, reply, now).
 *
 *     Visible outcomes: the TCP path returns CT_UPDATE_INVALID early
 *     (presumably when the TCP header cannot be fetched -- the condition is
 *     not visible) or the result of OvsConntrackUpdateTcpEntry; another path
 *     returns the result of OvsConntrackUpdateOtherEntry; the final return
 *     is CT_UPDATE_INVALID.
 */
210 static enum CT_UPDATE_RES
211 OvsCtUpdateEntry(OVS_CT_ENTRY
* entry
,
212 PNET_BUFFER_LIST nbl
,
224 tcp
= OvsGetTcp(nbl
, l4Offset
, &tcpStorage
);
226 return CT_UPDATE_INVALID
;
228 return OvsConntrackUpdateTcpEntry(entry
, tcp
, nbl
, reply
, now
);
232 return OvsConntrackUpdateOtherEntry(entry
, reply
, now
);
234 return CT_UPDATE_INVALID
;
/*
 * OvsCtEntryDelete
 *     Unlinks the entry from its bucket list and frees it with the conntrack
 *     pool tag.  The entry must not be used after this call.
 */
239 OvsCtEntryDelete(POVS_CT_ENTRY entry
)
241 RemoveEntryList(&entry
->link
);
242 OvsFreeMemoryWithTag(entry
, OVS_CT_POOL_TAG
);
/*
 * OvsCtEntryExpired
 *     Reads the current system time and returns whether entry->expiration
 *     lies strictly before it.
 *
 *     NOTE(review): the argument text "¤tTime" below looks like a
 *     mis-decoded "&currentTime" (the variable compared on the return line);
 *     confirm against the original file.
 */
245 static __inline BOOLEAN
246 OvsCtEntryExpired(POVS_CT_ENTRY entry
)
253 NdisGetCurrentSystemTime((LARGE_INTEGER
*)¤tTime
);
254 return entry
->expiration
< currentTime
;
/*
 * OvsDetectCtPacket
 *     Decides from the flow key whether conntrack can handle the packet.
 *     Returns NDIS_STATUS_SUCCESS for an unfragmented packet whose
 *     ipKey.nwProto is TCP, UDP or ICMP, and NDIS_STATUS_NOT_SUPPORTED on
 *     every other visible path.  The case labels of the switch on the
 *     ethertype are not visible in this listing; the supported branch reads
 *     key->ipKey, so it is presumably the IPv4 case -- TODO confirm.
 */
257 static __inline NDIS_STATUS
258 OvsDetectCtPacket(OvsFlowKey
*key
)
260 /* Currently only unfragmented TCP, UDP and ICMP packets are supported */
261 switch (ntohs(key
->l2
.dlType
)) {
263 if (key
->ipKey
.nwFrag
!= OVS_FRAG_TYPE_NONE
) {
264 return NDIS_STATUS_NOT_SUPPORTED
;
266 if (key
->ipKey
.nwProto
== IPPROTO_TCP
267 || key
->ipKey
.nwProto
== IPPROTO_UDP
268 || key
->ipKey
.nwProto
== IPPROTO_ICMP
) {
269 return NDIS_STATUS_SUCCESS
;
271 return NDIS_STATUS_NOT_SUPPORTED
;
273 return NDIS_STATUS_NOT_SUPPORTED
;
276 return NDIS_STATUS_NOT_SUPPORTED
;
279 static __inline BOOLEAN
280 OvsCtKeyAreSame(OVS_CT_KEY ctxKey
, OVS_CT_KEY entryKey
)
282 return ((ctxKey
.src
.addr
.ipv4
== entryKey
.src
.addr
.ipv4
) &&
283 (ctxKey
.src
.addr
.ipv4_aligned
== entryKey
.src
.addr
.ipv4_aligned
) &&
284 (ctxKey
.src
.port
== entryKey
.src
.port
) &&
285 (ctxKey
.dst
.addr
.ipv4
== entryKey
.dst
.addr
.ipv4
) &&
286 (ctxKey
.dst
.addr
.ipv4_aligned
== entryKey
.dst
.addr
.ipv4_aligned
) &&
287 (ctxKey
.dst
.port
== entryKey
.dst
.port
) &&
288 (ctxKey
.dl_type
== entryKey
.dl_type
) &&
289 (ctxKey
.nw_proto
== entryKey
.nw_proto
) &&
290 (ctxKey
.zone
== entryKey
.zone
));
/*
 * OvsCtLookup
 *     Searches the bucket selected by ctx->hash for an entry matching
 *     ctx->key.  Each entry is tested first against its forward key and
 *     then against its reverse key, and the match is checked for
 *     expiration.  What happens on a match or on expiry (e.g. how
 *     reply/found are set and what is returned) is not visible in this
 *     listing.
 *
 *     NOTE(review): found starts out NULL and is passed to
 *     OvsCtEntryExpired; confirm that call is only reached after a match.
 */
293 static __inline POVS_CT_ENTRY
294 OvsCtLookup(OvsConntrackKeyLookupCtx
*ctx
)
298 BOOLEAN reply
= FALSE
;
299 POVS_CT_ENTRY found
= NULL
;
301 LIST_FORALL(&ovsConntrackTable
[ctx
->hash
& CT_HASH_TABLE_MASK
], link
) {
302 entry
= CONTAINING_RECORD(link
, OVS_CT_ENTRY
, link
);
/* Same direction as the packet that created the entry. */
304 if (OvsCtKeyAreSame(ctx
->key
,entry
->key
)) {
/* Opposite direction. */
310 if (OvsCtKeyAreSame(ctx
->key
,entry
->rev_key
)) {
318 if (OvsCtEntryExpired(found
)) {
/*
 * OvsExtractLookupCtxHash
 *     Computes the hash used to pick a bucket for ctx->key.  The source and
 *     destination endpoints are hashed separately (basis 0) and XOR-ed, so
 *     swapping src and dst yields the same intermediate value; the key
 *     fields that follow dst are then hashed by a final OvsJhashBytes call
 *     whose last argument is not visible in this listing.
 *
 *     NOTE(review): in the final call the start pointer is written as
 *     "(uint32_t *) &ctx->key.dst + 1" (the cast applies before the + 1),
 *     whereas the length expression uses "(uint32_t *) (&ctx->key.dst + 1)".
 *     The two only agree if a ct_endpoint is 4 bytes; confirm the intended
 *     start is the end of dst.  Also confirm the length unit: it is a count
 *     of uint32_t elements here, while the calls above pass sizeof() bytes.
 */
329 static __inline UINT32
330 OvsExtractLookupCtxHash(OvsConntrackKeyLookupCtx
*ctx
)
332 UINT32 hsrc
, hdst
,hash
;
333 hsrc
= OvsJhashBytes((UINT32
*) &ctx
->key
.src
, sizeof(ctx
->key
.src
), 0);
334 hdst
= OvsJhashBytes((UINT32
*) &ctx
->key
.dst
, sizeof(ctx
->key
.dst
), 0);
335 hash
= hsrc
^ hdst
; /* TO identify reverse traffic */
336 return OvsJhashBytes((uint32_t *) &ctx
->key
.dst
+ 1,
337 ((uint32_t *) (&ctx
->key
+ 1) -
338 (uint32_t *) (&ctx
->key
.dst
+ 1)),
/*
 * OvsCtSetupLookupCtx
 *     Builds the conntrack lookup key in ctx from the flow key and computes
 *     ctx->hash.  The caller in this file passes
 *     (flowKey, zone, ctx, curNbl, l4Offset).
 *
 *     IPv4 and IPv6 flows copy addresses, protocol and L4 ports from the
 *     matching part of the flow key.  For IPv4 ICMP the ICMP header is read
 *     from the packet and the echo id replaces both ports.
 *
 *     Returns NDIS_STATUS_INVALID_PACKET on one path (presumably an
 *     ethertype that is neither IPv4 nor IPv6 -- the branch line is not
 *     visible) and NDIS_STATUS_SUCCESS after the hash has been stored.
 */
342 static __inline NDIS_STATUS
343 OvsCtSetupLookupCtx(OvsFlowKey
*flowKey
,
345 OvsConntrackKeyLookupCtx
*ctx
,
346 PNET_BUFFER_LIST curNbl
,
349 ctx
->key
.zone
= zone
;
350 ctx
->key
.dl_type
= flowKey
->l2
.dlType
;
351 ctx
->related
= FALSE
;
353 /* Extract L3 and L4*/
354 if (flowKey
->l2
.dlType
== htons(ETH_TYPE_IPV4
)) {
355 ctx
->key
.src
.addr
.ipv4
= flowKey
->ipKey
.nwSrc
;
356 ctx
->key
.dst
.addr
.ipv4
= flowKey
->ipKey
.nwDst
;
357 ctx
->key
.nw_proto
= flowKey
->ipKey
.nwProto
;
359 ctx
->key
.src
.port
= flowKey
->ipKey
.l4
.tpSrc
;
360 ctx
->key
.dst
.port
= flowKey
->ipKey
.l4
.tpDst
;
361 if (flowKey
->ipKey
.nwProto
== IPPROTO_ICMP
) {
/*
 * NOTE(review): icmp is dereferenced below with no NULL check visible in
 * this listing; confirm OvsGetIcmp cannot fail here or is checked.
 */
364 icmp
= OvsGetIcmp(curNbl
, l4Offset
, &icmpStorage
);
/* ICMP has no ports: the echo id stands in for both. */
366 ctx
->key
.src
.port
= ctx
->key
.dst
.port
= icmp
->fields
.echo
.id
;
368 /* Related bit is set when ICMP has an error */
369 /* XXX parse out the appropriate src and dst from inner pkt */
370 switch (icmp
->type
) {
371 case ICMP4_DEST_UNREACH
:
372 case ICMP4_TIME_EXCEEDED
:
373 case ICMP4_PARAM_PROB
:
374 case ICMP4_SOURCE_QUENCH
:
375 case ICMP4_REDIRECT
: {
380 ctx
->related
= FALSE
;
383 } else if (flowKey
->l2
.dlType
== htons(ETH_TYPE_IPV6
)) {
384 ctx
->key
.src
.addr
.ipv6
= flowKey
->ipv6Key
.ipv6Src
;
385 ctx
->key
.dst
.addr
.ipv6
= flowKey
->ipv6Key
.ipv6Dst
;
386 ctx
->key
.nw_proto
= flowKey
->ipv6Key
.nwProto
;
388 ctx
->key
.src
.port
= flowKey
->ipv6Key
.l4
.tpSrc
;
389 ctx
->key
.dst
.port
= flowKey
->ipv6Key
.l4
.tpDst
;
390 /* XXX Handle ICMPv6 errors*/
392 return NDIS_STATUS_INVALID_PACKET
;
/* The hash is taken over the fully populated key. */
395 ctx
->hash
= OvsExtractLookupCtxHash(ctx
);
396 return NDIS_STATUS_SUCCESS
;
400 *----------------------------------------------------------------------------
401 * OvsProcessConntrackEntry
402 * Check the TCP flags and set the ct_state of the entry
403 *----------------------------------------------------------------------------
/*
 * OvsProcessConntrackEntry
 *     Handles a packet that matched an existing entry (ctx->entry): derives
 *     the ct_state bits, lets OvsCtUpdateEntry advance the entry, and
 *     writes the result into the flow key.  The caller in this file passes
 *     (curNbl, l4Offset, ctx, key, zone, commit, currentTime).
 *
 *     Visible outcomes of the update: CT_UPDATE_VALID adds
 *     OVS_CS_F_ESTABLISHED (and OVS_CS_F_REPLY_DIR on a path whose
 *     condition is not visible), CT_UPDATE_INVALID adds OVS_CS_F_INVALID,
 *     and one further path deletes the old entry and creates a new one via
 *     OvsCtEntryCreate.  The flow key gets the entry's mark and labels on
 *     one path and mark 0 / no labels on the other; the selecting condition
 *     is not visible in this listing.
 */
405 static __inline POVS_CT_ENTRY
406 OvsProcessConntrackEntry(PNET_BUFFER_LIST curNbl
,
408 OvsConntrackKeyLookupCtx
*ctx
,
414 POVS_CT_ENTRY entry
= ctx
->entry
;
417 /* If an entry was found, update the state based on TCP flags */
419 state
|= OVS_CS_F_RELATED
;
/*
 * NOTE(review): plain assignment, unlike the "|=" used for every other state
 * bit in this function; if this runs after the line above it discards
 * OVS_CS_F_RELATED.  Confirm whether "|=" was intended.
 */
421 state
= OVS_CS_F_REPLY_DIR
;
424 CT_UPDATE_RES result
;
425 result
= OvsCtUpdateEntry(entry
, curNbl
, key
->ipKey
.nwProto
,
426 l4Offset
, ctx
->reply
, currentTime
);
428 case CT_UPDATE_VALID
:
429 state
|= OVS_CS_F_ESTABLISHED
;
431 state
|= OVS_CS_F_REPLY_DIR
;
434 case CT_UPDATE_INVALID
:
435 state
|= OVS_CS_F_INVALID
;
438 //Delete and update the Conntrack
439 OvsCtEntryDelete(ctx
->entry
);
/* The old entry is gone; entry now refers to whatever the create returns. */
441 entry
= OvsCtEntryCreate(curNbl
, key
->ipKey
.nwProto
, l4Offset
,
442 ctx
, key
, commit
, currentTime
);
446 /* Copy mark and label from entry into flowKey. If actions specify
447 different mark and label, update the flowKey. */
449 OvsCtUpdateFlowKey(key
, state
, zone
, entry
->mark
, &entry
->labels
);
451 OvsCtUpdateFlowKey(key
, state
, zone
, 0, NULL
);
/*
 * OvsConntrackSetMark
 *     Applies a masked mark update.  The caller in this file passes
 *     (key, entry, mark->value, mark->mask).  The new mark keeps the bits of
 *     entry->mark that lie outside mask and ORs value in as given (value is
 *     not itself masked here).  entry->mark and key->ct.mark are written
 *     only when the result differs from the current entry->mark.
 */
457 OvsConntrackSetMark(OvsFlowKey
*key
,
463 newMark
= value
| (entry
->mark
& ~(mask
));
464 if (entry
->mark
!= newMark
) {
465 entry
->mark
= newMark
;
466 key
->ct
.mark
= newMark
;
471 OvsConntrackSetLabels(OvsFlowKey
*key
,
473 struct ovs_key_ct_labels
*val
,
474 struct ovs_key_ct_labels
*mask
)
476 ovs_u128 v
, m
, pktMdLabel
= {0};
477 memcpy(&v
, val
, sizeof v
);
478 memcpy(&m
, mask
, sizeof m
);
480 pktMdLabel
.u64
.lo
= v
.u64
.lo
| (pktMdLabel
.u64
.lo
& ~(m
.u64
.lo
));
481 pktMdLabel
.u64
.hi
= v
.u64
.hi
| (pktMdLabel
.u64
.hi
& ~(m
.u64
.hi
));
483 NdisMoveMemory(&entry
->labels
, &pktMdLabel
,
484 sizeof(struct ovs_key_ct_labels
));
485 NdisMoveMemory(&key
->ct
.labels
, &pktMdLabel
,
486 sizeof(struct ovs_key_ct_labels
));
/*
 * OvsCtExecute_
 *     Runs one packet through connection tracking.  The caller in this file
 *     passes (curNbl, key, layers, commit, zone, mark, labels).
 *
 *     Steps visible here: take a timestamp, build the lookup context from
 *     the packet, then under the conntrack write lock look the key up,
 *     create an entry (no match) or process the existing one (match), and
 *     apply the optional mark and labels.  The lock is released before the
 *     function ends; the return statement is not visible in this listing.
 *
 *     NOTE(review): "¤tTime" below looks like a mis-decoded "&currentTime".
 */
489 static __inline NDIS_STATUS
490 OvsCtExecute_(PNET_BUFFER_LIST curNbl
,
492 OVS_PACKET_HDR_INFO
*layers
,
498 NDIS_STATUS status
= NDIS_STATUS_SUCCESS
;
499 POVS_CT_ENTRY entry
= NULL
;
500 OvsConntrackKeyLookupCtx ctx
= { 0 };
501 LOCK_STATE_EX lockState
;
503 NdisGetCurrentSystemTime((LARGE_INTEGER
*) ¤tTime
);
505 /* Retrieve the Conntrack Key related fields from packet */
/*
 * NOTE(review): the status returned by OvsCtSetupLookupCtx (which can be
 * NDIS_STATUS_INVALID_PACKET) is not captured here.
 */
506 OvsCtSetupLookupCtx(key
, zone
, &ctx
, curNbl
, layers
->l4Offset
);
508 NdisAcquireRWLockWrite(ovsConntrackLockObj
, &lockState
, 0);
510 /* Lookup Conntrack entries for a matching entry */
511 entry
= OvsCtLookup(&ctx
);
514 /* If no matching entry was found, create one and add New state */
515 entry
= OvsCtEntryCreate(curNbl
, key
->ipKey
.nwProto
,
516 layers
->l4Offset
, &ctx
,
517 key
, commit
, currentTime
);
519 /* Process the entry and update CT flags */
520 entry
= OvsProcessConntrackEntry(curNbl
, layers
->l4Offset
, &ctx
, key
,
521 zone
, commit
, currentTime
);
/* The guard on the mark update is not visible in this listing. */
525 OvsConntrackSetMark(key
, entry
, mark
->value
, mark
->mask
);
/* Labels are applied only when an entry exists and labels were supplied. */
528 if (entry
&& labels
) {
529 OvsConntrackSetLabels(key
, entry
, &labels
->value
, &labels
->mask
);
532 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
538 *---------------------------------------------------------------------------
539 * OvsExecuteConntrackAction
540 * Executes Conntrack actions XXX - Add more
541 *---------------------------------------------------------------------------
/*
 * OvsExecuteConntrackAction
 *     Entry point for the conntrack action.  It first asks
 *     OvsDetectCtPacket whether the packet is supported, then reads the
 *     optional nested attributes of the action (zone, commit, mark, labels)
 *     and hands everything to OvsCtExecute_.
 *
 *     commit defaults to FALSE and mark/labels default to NULL.  The checks
 *     guarding each attribute read, the handling of an unsupported packet
 *     and the return statement are not visible in this listing.
 */
544 OvsExecuteConntrackAction(PNET_BUFFER_LIST curNbl
,
545 OVS_PACKET_HDR_INFO
*layers
,
550 BOOLEAN commit
= FALSE
;
552 MD_MARK
*mark
= NULL
;
553 MD_LABELS
*labels
= NULL
;
556 status
= OvsDetectCtPacket(key
);
557 if (status
!= NDIS_STATUS_SUCCESS
) {
/* Zone: read as a 16-bit value. */
561 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_ZONE
);
563 zone
= NlAttrGetU16(ctAttr
);
565 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_COMMIT
);
/* Mark and labels: the attribute payload is used in place via a pointer. */
569 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_MARK
);
571 mark
= NlAttrGet(ctAttr
);
573 ctAttr
= NlAttrFindNested(a
, OVS_CT_ATTR_LABELS
);
575 labels
= NlAttrGet(ctAttr
);
578 status
= OvsCtExecute_(curNbl
, key
, layers
,
579 commit
, zone
, mark
, labels
);
584 *----------------------------------------------------------------------------
585 * ovsConntrackEntryCleaner
586 * Runs periodically and cleans up the connection tracker
587 *----------------------------------------------------------------------------
/*
 * ovsConntrackEntryCleaner
 *     Body of the cleaner thread created in OvsInitConntrack; data is
 *     treated as a POVS_CT_THREAD_CTX.
 *
 *     Each pass takes the conntrack write lock, walks every bucket and
 *     deletes entries whose expiration lies before the current system
 *     time, releases the lock, and then waits on context->event with a
 *     deadline of (current time + CT_CLEANUP_INTERVAL).  The thread ends
 *     with PsTerminateSystemThread(STATUS_SUCCESS).
 *
 *     The loop construct and the exit test are not visible in this listing;
 *     the first release of the lock below presumably belongs to the exit
 *     path taken once OvsCleanupConntrack has set the exit flag -- TODO
 *     confirm.
 *
 *     NOTE(review): "¤tTime" below looks like a mis-decoded "&currentTime".
 */
590 ovsConntrackEntryCleaner(PVOID data
)
593 POVS_CT_THREAD_CTX context
= (POVS_CT_THREAD_CTX
)data
;
594 PLIST_ENTRY link
, next
;
596 BOOLEAN success
= TRUE
;
599 LOCK_STATE_EX lockState
;
600 NdisAcquireRWLockWrite(ovsConntrackLockObj
, &lockState
, 0);
602 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
606 /* Set the timeout for the thread and cleanup */
607 UINT64 currentTime
, threadSleepTimeout
;
608 NdisGetCurrentSystemTime((LARGE_INTEGER
*)¤tTime
);
609 threadSleepTimeout
= currentTime
+ CT_CLEANUP_INTERVAL
;
/* The _SAFE iterator is used because entries are unlinked during the walk. */
611 for (int i
= 0; i
< CT_HASH_TABLE_SIZE
; i
++) {
612 LIST_FORALL_SAFE(&ovsConntrackTable
[i
], link
, next
) {
613 entry
= CONTAINING_RECORD(link
, OVS_CT_ENTRY
, link
);
614 if (entry
->expiration
< currentTime
) {
615 OvsCtEntryDelete(entry
);
/* Drop the lock before waiting so the packet path is not blocked meanwhile. */
620 NdisReleaseRWLock(ovsConntrackLockObj
, &lockState
);
621 KeWaitForSingleObject(&context
->event
, Executive
, KernelMode
,
622 FALSE
, (LARGE_INTEGER
*)&threadSleepTimeout
);
625 PsTerminateSystemThread(STATUS_SUCCESS
);