]> git.proxmox.com Git - ovs.git/blob - datapath-windows/ovsext/Conntrack.c
b78ba263e363089164bee11f80749f226204393f
[ovs.git] / datapath-windows / ovsext / Conntrack.c
1 /*
2 * Copyright (c) 2015, 2016 VMware, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifdef OVS_DBG_MOD
18 #undef OVS_DBG_MOD
19 #endif
20 #define OVS_DBG_MOD OVS_DBG_CONTRK
21
22 #include "Conntrack.h"
23 #include "Jhash.h"
24 #include "PacketParser.h"
25 #include "Debug.h"
26
27 typedef struct _OVS_CT_THREAD_CTX {
28 KEVENT event;
29 PVOID threadObject;
30 UINT32 exit;
31 } OVS_CT_THREAD_CTX, *POVS_CT_THREAD_CTX;
32
33 KSTART_ROUTINE ovsConntrackEntryCleaner;
34 static PLIST_ENTRY ovsConntrackTable;
35 static OVS_CT_THREAD_CTX ctThreadCtx;
36 static PNDIS_RW_LOCK_EX ovsConntrackLockObj;
37
38 /*
39 *----------------------------------------------------------------------------
40 * OvsInitConntrack
41 * Initialize the components used by Connection Tracking
42 *----------------------------------------------------------------------------
43 */
44 NTSTATUS
45 OvsInitConntrack(POVS_SWITCH_CONTEXT context)
46 {
47 NTSTATUS status;
48 HANDLE threadHandle = NULL;
49
50 /* Init the sync-lock */
51 ovsConntrackLockObj = NdisAllocateRWLock(context->NdisFilterHandle);
52 if (ovsConntrackLockObj == NULL) {
53 return STATUS_INSUFFICIENT_RESOURCES;
54 }
55
56 /* Init the Hash Buffer */
57 ovsConntrackTable = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY)
58 * CT_HASH_TABLE_SIZE,
59 OVS_CT_POOL_TAG);
60 if (ovsConntrackTable == NULL) {
61 NdisFreeRWLock(ovsConntrackLockObj);
62 ovsConntrackLockObj = NULL;
63 return STATUS_INSUFFICIENT_RESOURCES;
64 }
65
66 for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) {
67 InitializeListHead(&ovsConntrackTable[i]);
68 }
69
70 /* Init CT Cleaner Thread */
71 KeInitializeEvent(&ctThreadCtx.event, NotificationEvent, FALSE);
72 status = PsCreateSystemThread(&threadHandle, SYNCHRONIZE, NULL, NULL,
73 NULL, ovsConntrackEntryCleaner,
74 &ctThreadCtx);
75
76 if (status != STATUS_SUCCESS) {
77 NdisFreeRWLock(ovsConntrackLockObj);
78 ovsConntrackLockObj = NULL;
79
80 OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG);
81 ovsConntrackTable = NULL;
82
83 return status;
84 }
85
86 ObReferenceObjectByHandle(threadHandle, SYNCHRONIZE, NULL, KernelMode,
87 &ctThreadCtx.threadObject, NULL);
88 ZwClose(threadHandle);
89 threadHandle = NULL;
90 return STATUS_SUCCESS;
91 }
92
93 /*
94 *----------------------------------------------------------------------------
95 * OvsCleanupConntrack
96 * Cleanup memory and thread that were spawned for Connection tracking
97 *----------------------------------------------------------------------------
98 */
99 VOID
100 OvsCleanupConntrack(VOID)
101 {
102 LOCK_STATE_EX lockState;
103 NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0);
104 ctThreadCtx.exit = 1;
105 KeSetEvent(&ctThreadCtx.event, 0, FALSE);
106 NdisReleaseRWLock(ovsConntrackLockObj, &lockState);
107
108 KeWaitForSingleObject(ctThreadCtx.threadObject, Executive,
109 KernelMode, FALSE, NULL);
110 ObDereferenceObject(ctThreadCtx.threadObject);
111
112 if (ovsConntrackTable) {
113 OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG);
114 ovsConntrackTable = NULL;
115 }
116
117 NdisFreeRWLock(ovsConntrackLockObj);
118 ovsConntrackLockObj = NULL;
119 }
120
121 static __inline VOID
122 OvsCtKeyReverse(OVS_CT_KEY *key)
123 {
124 struct ct_endpoint tmp;
125 tmp = key->src;
126 key->src = key->dst;
127 key->dst = tmp;
128 }
129
130 static __inline VOID
131 OvsCtUpdateFlowKey(struct OvsFlowKey *key,
132 UINT32 state,
133 UINT16 zone,
134 UINT32 mark,
135 struct ovs_key_ct_labels *labels)
136 {
137 key->ct.state = state | OVS_CS_F_TRACKED;
138 key->ct.zone = zone;
139 key->ct.mark = mark;
140 if (labels) {
141 NdisMoveMemory(&key->ct.labels, labels,
142 sizeof(struct ovs_key_ct_labels));
143 } else {
144 memset(&key->ct.labels, 0,
145 sizeof(struct ovs_key_ct_labels));
146 }
147 }
148
149 static __inline VOID
150 OvsCtAddEntry(POVS_CT_ENTRY entry, OvsConntrackKeyLookupCtx *ctx)
151 {
152 NdisMoveMemory(&entry->key, &ctx->key, sizeof (OVS_CT_KEY));
153 NdisMoveMemory(&entry->rev_key, &ctx->key, sizeof (OVS_CT_KEY));
154 OvsCtKeyReverse(&entry->rev_key);
155 InsertHeadList(&ovsConntrackTable[ctx->hash & CT_HASH_TABLE_MASK],
156 &entry->link);
157 }
158
159 static __inline POVS_CT_ENTRY
160 OvsCtEntryCreate(PNET_BUFFER_LIST curNbl,
161 UINT8 ipProto,
162 UINT32 l4Offset,
163 OvsConntrackKeyLookupCtx *ctx,
164 OvsFlowKey *key,
165 BOOLEAN commit,
166 UINT64 currentTime)
167 {
168 POVS_CT_ENTRY entry = NULL;
169 UINT32 state = 0;
170 switch (ipProto)
171 {
172 case IPPROTO_TCP:
173 {
174 TCPHdr tcpStorage;
175 const TCPHdr *tcp;
176 tcp = OvsGetTcp(curNbl, l4Offset, &tcpStorage);
177 if (!OvsConntrackValidateTcpPacket(tcp)) {
178 goto invalid;
179 }
180
181 state |= OVS_CS_F_NEW;
182 if (commit) {
183 entry = OvsConntrackCreateTcpEntry(tcp, curNbl, currentTime);
184 OvsCtAddEntry(entry, ctx);
185 }
186
187 OvsCtUpdateFlowKey(key, state, ctx->key.zone, 0, NULL);
188 return entry;
189 }
190 case IPPROTO_ICMP:
191 case IPPROTO_UDP:
192 state |= OVS_CS_F_NEW;
193 if (commit) {
194 entry = OvsConntrackCreateOtherEntry(currentTime);
195 OvsCtAddEntry(entry, ctx);
196 }
197
198 OvsCtUpdateFlowKey(key, state, ctx->key.zone, 0, NULL);
199 return entry;
200 default:
201 goto invalid;
202 }
203
204 invalid:
205 state |= OVS_CS_F_INVALID;
206 OvsCtUpdateFlowKey(key, state, ctx->key.zone, 0, NULL);
207 return entry;
208 }
209
210 static enum CT_UPDATE_RES
211 OvsCtUpdateEntry(OVS_CT_ENTRY* entry,
212 PNET_BUFFER_LIST nbl,
213 UINT8 ipProto,
214 UINT32 l4Offset,
215 BOOLEAN reply,
216 UINT64 now)
217 {
218 switch (ipProto)
219 {
220 case IPPROTO_TCP:
221 {
222 TCPHdr tcpStorage;
223 const TCPHdr *tcp;
224 tcp = OvsGetTcp(nbl, l4Offset, &tcpStorage);
225 if (!tcp) {
226 return CT_UPDATE_INVALID;
227 }
228 return OvsConntrackUpdateTcpEntry(entry, tcp, nbl, reply, now);
229 }
230 case IPPROTO_ICMP:
231 case IPPROTO_UDP:
232 return OvsConntrackUpdateOtherEntry(entry, reply, now);
233 default:
234 return CT_UPDATE_INVALID;
235 }
236 }
237
238 static __inline VOID
239 OvsCtEntryDelete(POVS_CT_ENTRY entry)
240 {
241 RemoveEntryList(&entry->link);
242 OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG);
243 }
244
245 static __inline BOOLEAN
246 OvsCtEntryExpired(POVS_CT_ENTRY entry)
247 {
248 if (entry == NULL) {
249 return TRUE;
250 }
251
252 UINT64 currentTime;
253 NdisGetCurrentSystemTime((LARGE_INTEGER *)&currentTime);
254 return entry->expiration < currentTime;
255 }
256
257 static __inline NDIS_STATUS
258 OvsDetectCtPacket(OvsFlowKey *key)
259 {
260 /* Currently we support only Unfragmented TCP packets */
261 switch (ntohs(key->l2.dlType)) {
262 case ETH_TYPE_IPV4:
263 if (key->ipKey.nwFrag != OVS_FRAG_TYPE_NONE) {
264 return NDIS_STATUS_NOT_SUPPORTED;
265 }
266 if (key->ipKey.nwProto == IPPROTO_TCP
267 || key->ipKey.nwProto == IPPROTO_UDP
268 || key->ipKey.nwProto == IPPROTO_ICMP) {
269 return NDIS_STATUS_SUCCESS;
270 }
271 return NDIS_STATUS_NOT_SUPPORTED;
272 case ETH_TYPE_IPV6:
273 return NDIS_STATUS_NOT_SUPPORTED;
274 }
275
276 return NDIS_STATUS_NOT_SUPPORTED;
277 }
278
279 static __inline BOOLEAN
280 OvsCtKeyAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey)
281 {
282 return ((ctxKey.src.addr.ipv4 == entryKey.src.addr.ipv4) &&
283 (ctxKey.src.addr.ipv4_aligned == entryKey.src.addr.ipv4_aligned) &&
284 (ctxKey.src.port == entryKey.src.port) &&
285 (ctxKey.dst.addr.ipv4 == entryKey.dst.addr.ipv4) &&
286 (ctxKey.dst.addr.ipv4_aligned == entryKey.dst.addr.ipv4_aligned) &&
287 (ctxKey.dst.port == entryKey.dst.port) &&
288 (ctxKey.dl_type == entryKey.dl_type) &&
289 (ctxKey.nw_proto == entryKey.nw_proto) &&
290 (ctxKey.zone == entryKey.zone));
291 }
292
293 static __inline POVS_CT_ENTRY
294 OvsCtLookup(OvsConntrackKeyLookupCtx *ctx)
295 {
296 PLIST_ENTRY link;
297 POVS_CT_ENTRY entry;
298 BOOLEAN reply = FALSE;
299 POVS_CT_ENTRY found = NULL;
300
301 LIST_FORALL(&ovsConntrackTable[ctx->hash & CT_HASH_TABLE_MASK], link) {
302 entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link);
303
304 if (OvsCtKeyAreSame(ctx->key,entry->key)) {
305 found = entry;
306 reply = FALSE;
307 break;
308 }
309
310 if (OvsCtKeyAreSame(ctx->key,entry->rev_key)) {
311 found = entry;
312 reply = TRUE;
313 break;
314 }
315 }
316
317 if (found) {
318 if (OvsCtEntryExpired(found)) {
319 found = NULL;
320 } else {
321 ctx->reply = reply;
322 }
323 }
324
325 ctx->entry = found;
326 return found;
327 }
328
329 static __inline UINT32
330 OvsExtractLookupCtxHash(OvsConntrackKeyLookupCtx *ctx)
331 {
332 UINT32 hsrc, hdst,hash;
333 hsrc = OvsJhashBytes((UINT32*) &ctx->key.src, sizeof(ctx->key.src), 0);
334 hdst = OvsJhashBytes((UINT32*) &ctx->key.dst, sizeof(ctx->key.dst), 0);
335 hash = hsrc ^ hdst; /* TO identify reverse traffic */
336 return OvsJhashBytes((uint32_t *) &ctx->key.dst + 1,
337 ((uint32_t *) (&ctx->key + 1) -
338 (uint32_t *) (&ctx->key.dst + 1)),
339 hash);
340 }
341
342 static __inline NDIS_STATUS
343 OvsCtSetupLookupCtx(OvsFlowKey *flowKey,
344 UINT16 zone,
345 OvsConntrackKeyLookupCtx *ctx,
346 PNET_BUFFER_LIST curNbl,
347 UINT32 l4Offset)
348 {
349 ctx->key.zone = zone;
350 ctx->key.dl_type = flowKey->l2.dlType;
351 ctx->related = FALSE;
352
353 /* Extract L3 and L4*/
354 if (flowKey->l2.dlType == htons(ETH_TYPE_IPV4)) {
355 ctx->key.src.addr.ipv4 = flowKey->ipKey.nwSrc;
356 ctx->key.dst.addr.ipv4 = flowKey->ipKey.nwDst;
357 ctx->key.nw_proto = flowKey->ipKey.nwProto;
358
359 ctx->key.src.port = flowKey->ipKey.l4.tpSrc;
360 ctx->key.dst.port = flowKey->ipKey.l4.tpDst;
361 if (flowKey->ipKey.nwProto == IPPROTO_ICMP) {
362 ICMPHdr icmpStorage;
363 const ICMPHdr *icmp;
364 icmp = OvsGetIcmp(curNbl, l4Offset, &icmpStorage);
365 ASSERT(icmp);
366 ctx->key.src.port = ctx->key.dst.port = icmp->fields.echo.id;
367
368 /* Related bit is set when ICMP has an error */
369 /* XXX parse out the appropriate src and dst from inner pkt */
370 switch (icmp->type) {
371 case ICMP4_DEST_UNREACH:
372 case ICMP4_TIME_EXCEEDED:
373 case ICMP4_PARAM_PROB:
374 case ICMP4_SOURCE_QUENCH:
375 case ICMP4_REDIRECT: {
376 ctx->related = TRUE;
377 break;
378 }
379 default:
380 ctx->related = FALSE;
381 }
382 }
383 } else if (flowKey->l2.dlType == htons(ETH_TYPE_IPV6)) {
384 ctx->key.src.addr.ipv6 = flowKey->ipv6Key.ipv6Src;
385 ctx->key.dst.addr.ipv6 = flowKey->ipv6Key.ipv6Dst;
386 ctx->key.nw_proto = flowKey->ipv6Key.nwProto;
387
388 ctx->key.src.port = flowKey->ipv6Key.l4.tpSrc;
389 ctx->key.dst.port = flowKey->ipv6Key.l4.tpDst;
390 /* XXX Handle ICMPv6 errors*/
391 } else {
392 return NDIS_STATUS_INVALID_PACKET;
393 }
394
395 ctx->hash = OvsExtractLookupCtxHash(ctx);
396 return NDIS_STATUS_SUCCESS;
397 }
398
399 /*
400 *----------------------------------------------------------------------------
401 * OvsProcessConntrackEntry
402 * Check the TCP flags and set the ct_state of the entry
403 *----------------------------------------------------------------------------
404 */
405 static __inline POVS_CT_ENTRY
406 OvsProcessConntrackEntry(PNET_BUFFER_LIST curNbl,
407 UINT32 l4Offset,
408 OvsConntrackKeyLookupCtx *ctx,
409 OvsFlowKey *key,
410 UINT16 zone,
411 BOOLEAN commit,
412 UINT64 currentTime)
413 {
414 POVS_CT_ENTRY entry = ctx->entry;
415 UINT32 state = 0;
416
417 /* If an entry was found, update the state based on TCP flags */
418 if (ctx->related) {
419 state |= OVS_CS_F_RELATED;
420 if (ctx->reply) {
421 state = OVS_CS_F_REPLY_DIR;
422 }
423 } else {
424 CT_UPDATE_RES result;
425 result = OvsCtUpdateEntry(entry, curNbl, key->ipKey.nwProto,
426 l4Offset, ctx->reply, currentTime);
427 switch (result) {
428 case CT_UPDATE_VALID:
429 state |= OVS_CS_F_ESTABLISHED;
430 if (ctx->reply) {
431 state |= OVS_CS_F_REPLY_DIR;
432 }
433 break;
434 case CT_UPDATE_INVALID:
435 state |= OVS_CS_F_INVALID;
436 break;
437 case CT_UPDATE_NEW:
438 //Delete and update the Conntrack
439 OvsCtEntryDelete(ctx->entry);
440 ctx->entry = NULL;
441 entry = OvsCtEntryCreate(curNbl, key->ipKey.nwProto, l4Offset,
442 ctx, key, commit, currentTime);
443 break;
444 }
445 }
446 /* Copy mark and label from entry into flowKey. If actions specify
447 different mark and label, update the flowKey. */
448 if (entry != NULL) {
449 OvsCtUpdateFlowKey(key, state, zone, entry->mark, &entry->labels);
450 } else {
451 OvsCtUpdateFlowKey(key, state, zone, 0, NULL);
452 }
453 return entry;
454 }
455
456 static __inline VOID
457 OvsConntrackSetMark(OvsFlowKey *key,
458 POVS_CT_ENTRY entry,
459 UINT32 value,
460 UINT32 mask)
461 {
462 UINT32 newMark;
463 newMark = value | (entry->mark & ~(mask));
464 if (entry->mark != newMark) {
465 entry->mark = newMark;
466 key->ct.mark = newMark;
467 }
468 }
469
470 static __inline void
471 OvsConntrackSetLabels(OvsFlowKey *key,
472 POVS_CT_ENTRY entry,
473 struct ovs_key_ct_labels *val,
474 struct ovs_key_ct_labels *mask)
475 {
476 ovs_u128 v, m, pktMdLabel = {0};
477 memcpy(&v, val, sizeof v);
478 memcpy(&m, mask, sizeof m);
479
480 pktMdLabel.u64.lo = v.u64.lo | (pktMdLabel.u64.lo & ~(m.u64.lo));
481 pktMdLabel.u64.hi = v.u64.hi | (pktMdLabel.u64.hi & ~(m.u64.hi));
482
483 NdisMoveMemory(&entry->labels, &pktMdLabel,
484 sizeof(struct ovs_key_ct_labels));
485 NdisMoveMemory(&key->ct.labels, &pktMdLabel,
486 sizeof(struct ovs_key_ct_labels));
487 }
488
489 static __inline NDIS_STATUS
490 OvsCtExecute_(PNET_BUFFER_LIST curNbl,
491 OvsFlowKey *key,
492 OVS_PACKET_HDR_INFO *layers,
493 BOOLEAN commit,
494 UINT16 zone,
495 MD_MARK *mark,
496 MD_LABELS *labels)
497 {
498 NDIS_STATUS status = NDIS_STATUS_SUCCESS;
499 POVS_CT_ENTRY entry = NULL;
500 OvsConntrackKeyLookupCtx ctx = { 0 };
501 LOCK_STATE_EX lockState;
502 UINT64 currentTime;
503 NdisGetCurrentSystemTime((LARGE_INTEGER *) &currentTime);
504
505 /* Retrieve the Conntrack Key related fields from packet */
506 OvsCtSetupLookupCtx(key, zone, &ctx, curNbl, layers->l4Offset);
507
508 NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0);
509
510 /* Lookup Conntrack entries for a matching entry */
511 entry = OvsCtLookup(&ctx);
512
513 if (!entry) {
514 /* If no matching entry was found, create one and add New state */
515 entry = OvsCtEntryCreate(curNbl, key->ipKey.nwProto,
516 layers->l4Offset, &ctx,
517 key, commit, currentTime);
518 } else {
519 /* Process the entry and update CT flags */
520 entry = OvsProcessConntrackEntry(curNbl, layers->l4Offset, &ctx, key,
521 zone, commit, currentTime);
522 }
523
524 if (entry && mark) {
525 OvsConntrackSetMark(key, entry, mark->value, mark->mask);
526 }
527
528 if (entry && labels) {
529 OvsConntrackSetLabels(key, entry, &labels->value, &labels->mask);
530 }
531
532 NdisReleaseRWLock(ovsConntrackLockObj, &lockState);
533
534 return status;
535 }
536
537 /*
538 *---------------------------------------------------------------------------
539 * OvsExecuteConntrackAction
540 * Executes Conntrack actions XXX - Add more
541 *---------------------------------------------------------------------------
542 */
543 NDIS_STATUS
544 OvsExecuteConntrackAction(PNET_BUFFER_LIST curNbl,
545 OVS_PACKET_HDR_INFO *layers,
546 OvsFlowKey *key,
547 const PNL_ATTR a)
548 {
549 PNL_ATTR ctAttr;
550 BOOLEAN commit = FALSE;
551 UINT16 zone = 0;
552 MD_MARK *mark = NULL;
553 MD_LABELS *labels = NULL;
554 NDIS_STATUS status;
555
556 status = OvsDetectCtPacket(key);
557 if (status != NDIS_STATUS_SUCCESS) {
558 return status;
559 }
560
561 ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_ZONE);
562 if (ctAttr) {
563 zone = NlAttrGetU16(ctAttr);
564 }
565 ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_COMMIT);
566 if (ctAttr) {
567 commit = TRUE;
568 }
569 ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_MARK);
570 if (ctAttr) {
571 mark = NlAttrGet(ctAttr);
572 }
573 ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_LABELS);
574 if (ctAttr) {
575 labels = NlAttrGet(ctAttr);
576 }
577
578 status = OvsCtExecute_(curNbl, key, layers,
579 commit, zone, mark, labels);
580 return status;
581 }
582
583 /*
584 *----------------------------------------------------------------------------
585 * OvsConntrackEnrtyCleaner
586 * Runs periodically and cleans up the connection tracker
587 *----------------------------------------------------------------------------
588 */
589 VOID
590 ovsConntrackEntryCleaner(PVOID data)
591 {
592
593 POVS_CT_THREAD_CTX context = (POVS_CT_THREAD_CTX)data;
594 PLIST_ENTRY link, next;
595 POVS_CT_ENTRY entry;
596 BOOLEAN success = TRUE;
597
598 while (success) {
599 LOCK_STATE_EX lockState;
600 NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0);
601 if (context->exit) {
602 NdisReleaseRWLock(ovsConntrackLockObj, &lockState);
603 break;
604 }
605
606 /* Set the timeout for the thread and cleanup */
607 UINT64 currentTime, threadSleepTimeout;
608 NdisGetCurrentSystemTime((LARGE_INTEGER *)&currentTime);
609 threadSleepTimeout = currentTime + CT_CLEANUP_INTERVAL;
610
611 for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) {
612 LIST_FORALL_SAFE(&ovsConntrackTable[i], link, next) {
613 entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link);
614 if (entry->expiration < currentTime) {
615 OvsCtEntryDelete(entry);
616 }
617 }
618 }
619
620 NdisReleaseRWLock(ovsConntrackLockObj, &lockState);
621 KeWaitForSingleObject(&context->event, Executive, KernelMode,
622 FALSE, (LARGE_INTEGER *)&threadSleepTimeout);
623 }
624
625 PsTerminateSystemThread(STATUS_SUCCESS);
626 }