流表查询是datapath报文处理过程中最为关键的一个步骤:一个skb报文进入datapath之后,如何才能快速地匹配到对应的流表项? 本篇分析ovs是如何查询流表的。
1、ovs_flow_tbl_lookup_stats函数
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, //由ovs_flow_key_extract函数根据skb生成 u32 skb_hash,<span style="white-space:pre"> </span> //skb中携带的信息 u32 *n_mask_hit) { struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); //得到table实例 struct mask_cache_entry *entries, *ce; struct sw_flow *flow; u32 hash; int seg; *n_mask_hit = 0; if (unlikely(!skb_hash)) { //如果报文没有hash值,则mask_index为0,全遍历所有的mask。 u32 mask_index = 0; return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash * value. To avoid hash collisions, rehash the 'skb_hash' with * 'recirc_id'. */ if (key->recirc_id) skb_hash = jhash_1word(skb_hash, key->recirc_id); ce = NULL; hash = skb_hash; entries = this_cpu_ptr(tbl->mask_cache); /* Find the cache entry 'ce' to operate on. */ for (seg = 0; seg < MC_HASH_SEGS; seg++) { //32位的hash值被分成4段,每段8字节,作为cache的索引 int index = hash & (MC_HASH_ENTRIES - 1); struct mask_cache_entry *e; e = &entries[index]; //entry最大为256项 if (e->skb_hash == skb_hash) { //如果在cache entry找到报文hash相同项,则根据该entry指定的mask查表 flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; } if (!ce || e->skb_hash < ce->skb_hash) ce = e; /* A better replacement cache candidate. */ hash >>= MC_HASH_SHIFT; } /* Cache miss, do full lookup. */ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); //没有命中,ce作为新的cache项,将被刷新,下一次可以直接命中 if (flow) ce->skb_hash = skb_hash; return flow; }2、flow_lookup函数
</pre><pre name="code" class="cpp">static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, const struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, u32 *index) { struct sw_flow_mask *mask; struct sw_flow *flow; int i; if (*index < ma->max) { //如果index的值小于mask的entry数量,说明index是有效值,基于该值获取sw_flow_mask值 mask = rcu_dereference_ovsl(ma->masks[*index]); if (mask) { flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) return flow; } } for (i = 0; i < ma->max; i++) { if (i == *index) //前面已查询过,所以跳过该mask continue; mask = rcu_dereference_ovsl(ma->masks[i]); if (!mask) continue; flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { /* Found */ *index = i; //更新index指向的值,下次可以直接命中;此处说明cache没有命中,下一次可以直接命中 return flow; } } return NULL; }3、masked_flow_lookup函数
/*
 * Look up 'unmasked' in table instance 'ti' under one specific 'mask'.
 *
 * The key is masked, hashed over mask->range, and the resulting bucket is
 * walked for an entry with the same mask, the same hash and an equal
 * masked key.
 *
 * @ti:         table instance whose buckets are searched.
 * @unmasked:   flow key before masking.
 * @mask:       mask to apply; also compared by pointer against each
 *              candidate's mask.
 * @n_mask_hit: incremented once per call.
 *
 * Returns the matching flow, or NULL if the bucket holds no match.
 */
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
					  const struct sw_flow_key *unmasked,
					  const struct sw_flow_mask *mask,
					  u32 *n_mask_hit)
{
	struct sw_flow_key masked;
	struct hlist_head *bucket;
	struct sw_flow *candidate;
	u32 key_hash;

	/* Masking the key is what lets wildcarded flows match; 'false' asks
	 * for a partial (non-full) masking.
	 */
	ovs_flow_mask_key(&masked, unmasked, false, mask);

	/* Hash the masked key over the mask's range and pick the bucket. */
	key_hash = flow_hash(&masked, &mask->range);
	bucket = find_bucket(ti, key_hash);
	*n_mask_hit += 1;

	hlist_for_each_entry_rcu(candidate, bucket,
				 flow_table.node[ti->node_ver]) {
		if (candidate->mask != mask)
			continue;
		if (candidate->flow_table.hash != key_hash)
			continue;
		/* Same mask and same hash: confirm with a key comparison. */
		if (flow_cmp_masked_key(candidate, &masked, &mask->range))
			return candidate;
	}

	return NULL;
}

/* ovs_flow_mask_key function */
/*
 * Compute dst = src & mask->key, one 'long' at a time.
 *
 * @dst:  destination key.
 * @src:  source (unmasked) key.
 * @full: true  - process the whole key, starting at offset 0 for
 *                sizeof *dst bytes, so all of 'dst' is initialized;
 *        false - process only the bytes described by mask->range; memory
 *                in 'dst' outside that range is left untouched
 *                (uninitialized). This is an optimization for callers that
 *                only read 'dst' within mask->range afterwards.
 * @mask: supplies the mask key and, when !full, the range to process.
 *
 * NOTE(review): the copy advances in sizeof(long) steps, so it presumably
 * relies on the range start and length being long-aligned - confirm.
 */
void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
		       bool full, const struct sw_flow_mask *mask)
{
	const long *mask_word;
	const long *src_word;
	long *dst_word;
	int offset;
	int n_bytes;
	int done;

	if (full) {
		offset = 0;
		n_bytes = sizeof *dst;
	} else {
		offset = mask->range.start;
		n_bytes = range_n_bytes(&mask->range);
	}

	/* All three cursors start at the same byte offset into their keys. */
	mask_word = (const long *)((const u8 *)&mask->key + offset);
	src_word = (const long *)((const u8 *)src + offset);
	dst_word = (long *)((u8 *)dst + offset);

	for (done = 0; done < n_bytes; done += sizeof(long)) {
		*dst_word = *src_word & *mask_word;
		dst_word++;
		src_word++;
		mask_word++;
	}
}

/* find_bucket function */
/*
 * Map a flow hash to its bucket list head in table instance 'ti'.
 *
 * The hash is first mixed with ti->hash_seed, then reduced to a bucket
 * index by AND-ing with (ti->n_buckets - 1), i.e. the low bits of the
 * seeded hash select the bucket.
 * NOTE(review): this presumably requires n_buckets to be a power of two -
 * confirm where the table instance is allocated.
 */
static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
{
	u32 bucket_idx;

	hash = jhash_1word(hash, ti->hash_seed);
	bucket_idx = hash & (ti->n_buckets - 1);

	return flex_array_get(ti->buckets, bucket_idx);
}