l2t.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. /*
  2. * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
  3. * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
  4. *
  5. * This software is available to you under a choice of one of two
  6. * licenses. You may choose to be licensed under the terms of the GNU
  7. * General Public License (GPL) Version 2, available from the file
  8. * COPYING in the main directory of this source tree, or the
  9. * OpenIB.org BSD license below:
  10. *
  11. * Redistribution and use in source and binary forms, with or
  12. * without modification, are permitted provided that the following
  13. * conditions are met:
  14. *
  15. * - Redistributions of source code must retain the above
  16. * copyright notice, this list of conditions and the following
  17. * disclaimer.
  18. *
  19. * - Redistributions in binary form must reproduce the above
  20. * copyright notice, this list of conditions and the following
  21. * disclaimer in the documentation and/or other materials
  22. * provided with the distribution.
  23. *
  24. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28. * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29. * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30. * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31. * SOFTWARE.
  32. */
  33. #include <linux/skbuff.h>
  34. #include <linux/netdevice.h>
  35. #include <linux/if.h>
  36. #include <linux/if_vlan.h>
  37. #include <linux/jhash.h>
  38. #include <net/neighbour.h>
  39. #include "common.h"
  40. #include "t3cdev.h"
  41. #include "cxgb3_defs.h"
  42. #include "l2t.h"
  43. #include "t3_cpl.h"
  44. #include "firmware_exports.h"
  45. #define VLAN_NONE 0xfff
  46. /*
  47. * Module locking notes: There is a RW lock protecting the L2 table as a
  48. * whole plus a spinlock per L2T entry. Entry lookups and allocations happen
  49. * under the protection of the table lock, individual entry changes happen
  50. * while holding that entry's spinlock. The table lock nests outside the
  51. * entry locks. Allocations of new entries take the table lock as writers so
  52. * no other lookups can happen while allocating new entries. Entry updates
  53. * take the table lock as readers so multiple entries can be updated in
  54. * parallel. An L2T entry can be dropped by decrementing its reference count
  55. * and therefore can happen in parallel with entry allocation but no entry
  56. * can change state or increment its ref count during allocation as both of
  57. * these perform lookups.
  58. */
  59. static inline unsigned int vlan_prio(const struct l2t_entry *e)
  60. {
  61. return e->vlan >> 13;
  62. }
  63. static inline unsigned int arp_hash(u32 key, int ifindex,
  64. const struct l2t_data *d)
  65. {
  66. return jhash_2words(key, ifindex, 0) & (d->nentries - 1);
  67. }
  68. static inline void neigh_replace(struct l2t_entry *e, struct neighbour *n)
  69. {
  70. neigh_hold(n);
  71. if (e->neigh)
  72. neigh_release(e->neigh);
  73. e->neigh = n;
  74. }
  75. /*
  76. * Set up an L2T entry and send any packets waiting in the arp queue. The
  77. * supplied skb is used for the CPL_L2T_WRITE_REQ. Must be called with the
  78. * entry locked.
  79. */
  80. static int setup_l2e_send_pending(struct t3cdev *dev, struct sk_buff *skb,
  81. struct l2t_entry *e)
  82. {
  83. struct cpl_l2t_write_req *req;
  84. if (!skb) {
  85. skb = alloc_skb(sizeof(*req), GFP_ATOMIC);
  86. if (!skb)
  87. return -ENOMEM;
  88. }
  89. req = (struct cpl_l2t_write_req *)__skb_put(skb, sizeof(*req));
  90. req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
  91. OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx));
  92. req->params = htonl(V_L2T_W_IDX(e->idx) | V_L2T_W_IFF(e->smt_idx) |
  93. V_L2T_W_VLAN(e->vlan & VLAN_VID_MASK) |
  94. V_L2T_W_PRIO(vlan_prio(e)));
  95. memcpy(e->dmac, e->neigh->ha, sizeof(e->dmac));
  96. memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
  97. skb->priority = CPL_PRIORITY_CONTROL;
  98. cxgb3_ofld_send(dev, skb);
  99. while (e->arpq_head) {
  100. skb = e->arpq_head;
  101. e->arpq_head = skb->next;
  102. skb->next = NULL;
  103. cxgb3_ofld_send(dev, skb);
  104. }
  105. e->arpq_tail = NULL;
  106. e->state = L2T_STATE_VALID;
  107. return 0;
  108. }
  109. /*
  110. * Add a packet to the an L2T entry's queue of packets awaiting resolution.
  111. * Must be called with the entry's lock held.
  112. */
  113. static inline void arpq_enqueue(struct l2t_entry *e, struct sk_buff *skb)
  114. {
  115. skb->next = NULL;
  116. if (e->arpq_head)
  117. e->arpq_tail->next = skb;
  118. else
  119. e->arpq_head = skb;
  120. e->arpq_tail = skb;
  121. }
  122. int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb,
  123. struct l2t_entry *e)
  124. {
  125. again:
  126. switch (e->state) {
  127. case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
  128. neigh_event_send(e->neigh, NULL);
  129. spin_lock_bh(&e->lock);
  130. if (e->state == L2T_STATE_STALE)
  131. e->state = L2T_STATE_VALID;
  132. spin_unlock_bh(&e->lock);
  133. case L2T_STATE_VALID: /* fast-path, send the packet on */
  134. return cxgb3_ofld_send(dev, skb);
  135. case L2T_STATE_RESOLVING:
  136. spin_lock_bh(&e->lock);
  137. if (e->state != L2T_STATE_RESOLVING) {
  138. /* ARP already completed */
  139. spin_unlock_bh(&e->lock);
  140. goto again;
  141. }
  142. arpq_enqueue(e, skb);
  143. spin_unlock_bh(&e->lock);
  144. /*
  145. * Only the first packet added to the arpq should kick off
  146. * resolution. However, because the alloc_skb below can fail,
  147. * we allow each packet added to the arpq to retry resolution
  148. * as a way of recovering from transient memory exhaustion.
  149. * A better way would be to use a work request to retry L2T
  150. * entries when there's no memory.
  151. */
  152. if (!neigh_event_send(e->neigh, NULL)) {
  153. skb = alloc_skb(sizeof(struct cpl_l2t_write_req),
  154. GFP_ATOMIC);
  155. if (!skb)
  156. break;
  157. spin_lock_bh(&e->lock);
  158. if (e->arpq_head)
  159. setup_l2e_send_pending(dev, skb, e);
  160. else /* we lost the race */
  161. __kfree_skb(skb);
  162. spin_unlock_bh(&e->lock);
  163. }
  164. }
  165. return 0;
  166. }
  167. EXPORT_SYMBOL(t3_l2t_send_slow);
  168. void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e)
  169. {
  170. again:
  171. switch (e->state) {
  172. case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
  173. neigh_event_send(e->neigh, NULL);
  174. spin_lock_bh(&e->lock);
  175. if (e->state == L2T_STATE_STALE) {
  176. e->state = L2T_STATE_VALID;
  177. }
  178. spin_unlock_bh(&e->lock);
  179. return;
  180. case L2T_STATE_VALID: /* fast-path, send the packet on */
  181. return;
  182. case L2T_STATE_RESOLVING:
  183. spin_lock_bh(&e->lock);
  184. if (e->state != L2T_STATE_RESOLVING) {
  185. /* ARP already completed */
  186. spin_unlock_bh(&e->lock);
  187. goto again;
  188. }
  189. spin_unlock_bh(&e->lock);
  190. /*
  191. * Only the first packet added to the arpq should kick off
  192. * resolution. However, because the alloc_skb below can fail,
  193. * we allow each packet added to the arpq to retry resolution
  194. * as a way of recovering from transient memory exhaustion.
  195. * A better way would be to use a work request to retry L2T
  196. * entries when there's no memory.
  197. */
  198. neigh_event_send(e->neigh, NULL);
  199. }
  200. return;
  201. }
  202. EXPORT_SYMBOL(t3_l2t_send_event);
  203. /*
  204. * Allocate a free L2T entry. Must be called with l2t_data.lock held.
  205. */
  206. static struct l2t_entry *alloc_l2e(struct l2t_data *d)
  207. {
  208. struct l2t_entry *end, *e, **p;
  209. if (!atomic_read(&d->nfree))
  210. return NULL;
  211. /* there's definitely a free entry */
  212. for (e = d->rover, end = &d->l2tab[d->nentries]; e != end; ++e)
  213. if (atomic_read(&e->refcnt) == 0)
  214. goto found;
  215. for (e = &d->l2tab[1]; atomic_read(&e->refcnt); ++e) ;
  216. found:
  217. d->rover = e + 1;
  218. atomic_dec(&d->nfree);
  219. /*
  220. * The entry we found may be an inactive entry that is
  221. * presently in the hash table. We need to remove it.
  222. */
  223. if (e->state != L2T_STATE_UNUSED) {
  224. int hash = arp_hash(e->addr, e->ifindex, d);
  225. for (p = &d->l2tab[hash].first; *p; p = &(*p)->next)
  226. if (*p == e) {
  227. *p = e->next;
  228. break;
  229. }
  230. e->state = L2T_STATE_UNUSED;
  231. }
  232. return e;
  233. }
  234. /*
  235. * Called when an L2T entry has no more users. The entry is left in the hash
  236. * table since it is likely to be reused but we also bump nfree to indicate
  237. * that the entry can be reallocated for a different neighbor. We also drop
  238. * the existing neighbor reference in case the neighbor is going away and is
  239. * waiting on our reference.
  240. *
  241. * Because entries can be reallocated to other neighbors once their ref count
  242. * drops to 0 we need to take the entry's lock to avoid races with a new
  243. * incarnation.
  244. */
  245. void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e)
  246. {
  247. spin_lock_bh(&e->lock);
  248. if (atomic_read(&e->refcnt) == 0) { /* hasn't been recycled */
  249. if (e->neigh) {
  250. neigh_release(e->neigh);
  251. e->neigh = NULL;
  252. }
  253. }
  254. spin_unlock_bh(&e->lock);
  255. atomic_inc(&d->nfree);
  256. }
  257. EXPORT_SYMBOL(t3_l2e_free);
  258. /*
  259. * Update an L2T entry that was previously used for the same next hop as neigh.
  260. * Must be called with softirqs disabled.
  261. */
  262. static inline void reuse_entry(struct l2t_entry *e, struct neighbour *neigh)
  263. {
  264. unsigned int nud_state;
  265. spin_lock(&e->lock); /* avoid race with t3_l2t_free */
  266. if (neigh != e->neigh)
  267. neigh_replace(e, neigh);
  268. nud_state = neigh->nud_state;
  269. if (memcmp(e->dmac, neigh->ha, sizeof(e->dmac)) ||
  270. !(nud_state & NUD_VALID))
  271. e->state = L2T_STATE_RESOLVING;
  272. else if (nud_state & NUD_CONNECTED)
  273. e->state = L2T_STATE_VALID;
  274. else
  275. e->state = L2T_STATE_STALE;
  276. spin_unlock(&e->lock);
  277. }
  278. struct l2t_entry *t3_l2t_get(struct t3cdev *cdev, struct neighbour *neigh,
  279. struct net_device *dev)
  280. {
  281. struct l2t_entry *e;
  282. struct l2t_data *d = L2DATA(cdev);
  283. u32 addr = *(u32 *) neigh->primary_key;
  284. int ifidx = neigh->dev->ifindex;
  285. int hash = arp_hash(addr, ifidx, d);
  286. struct port_info *p = netdev_priv(dev);
  287. int smt_idx = p->port_id;
  288. write_lock_bh(&d->lock);
  289. for (e = d->l2tab[hash].first; e; e = e->next)
  290. if (e->addr == addr && e->ifindex == ifidx &&
  291. e->smt_idx == smt_idx) {
  292. l2t_hold(d, e);
  293. if (atomic_read(&e->refcnt) == 1)
  294. reuse_entry(e, neigh);
  295. goto done;
  296. }
  297. /* Need to allocate a new entry */
  298. e = alloc_l2e(d);
  299. if (e) {
  300. spin_lock(&e->lock); /* avoid race with t3_l2t_free */
  301. e->next = d->l2tab[hash].first;
  302. d->l2tab[hash].first = e;
  303. e->state = L2T_STATE_RESOLVING;
  304. e->addr = addr;
  305. e->ifindex = ifidx;
  306. e->smt_idx = smt_idx;
  307. atomic_set(&e->refcnt, 1);
  308. neigh_replace(e, neigh);
  309. if (neigh->dev->priv_flags & IFF_802_1Q_VLAN)
  310. e->vlan = VLAN_DEV_INFO(neigh->dev)->vlan_id;
  311. else
  312. e->vlan = VLAN_NONE;
  313. spin_unlock(&e->lock);
  314. }
  315. done:
  316. write_unlock_bh(&d->lock);
  317. return e;
  318. }
  319. EXPORT_SYMBOL(t3_l2t_get);
  320. /*
  321. * Called when address resolution fails for an L2T entry to handle packets
  322. * on the arpq head. If a packet specifies a failure handler it is invoked,
  323. * otherwise the packets is sent to the offload device.
  324. *
  325. * XXX: maybe we should abandon the latter behavior and just require a failure
  326. * handler.
  327. */
  328. static void handle_failed_resolution(struct t3cdev *dev, struct sk_buff *arpq)
  329. {
  330. while (arpq) {
  331. struct sk_buff *skb = arpq;
  332. struct l2t_skb_cb *cb = L2T_SKB_CB(skb);
  333. arpq = skb->next;
  334. skb->next = NULL;
  335. if (cb->arp_failure_handler)
  336. cb->arp_failure_handler(dev, skb);
  337. else
  338. cxgb3_ofld_send(dev, skb);
  339. }
  340. }
  341. /*
  342. * Called when the host's ARP layer makes a change to some entry that is
  343. * loaded into the HW L2 table.
  344. */
  345. void t3_l2t_update(struct t3cdev *dev, struct neighbour *neigh)
  346. {
  347. struct l2t_entry *e;
  348. struct sk_buff *arpq = NULL;
  349. struct l2t_data *d = L2DATA(dev);
  350. u32 addr = *(u32 *) neigh->primary_key;
  351. int ifidx = neigh->dev->ifindex;
  352. int hash = arp_hash(addr, ifidx, d);
  353. read_lock_bh(&d->lock);
  354. for (e = d->l2tab[hash].first; e; e = e->next)
  355. if (e->addr == addr && e->ifindex == ifidx) {
  356. spin_lock(&e->lock);
  357. goto found;
  358. }
  359. read_unlock_bh(&d->lock);
  360. return;
  361. found:
  362. read_unlock(&d->lock);
  363. if (atomic_read(&e->refcnt)) {
  364. if (neigh != e->neigh)
  365. neigh_replace(e, neigh);
  366. if (e->state == L2T_STATE_RESOLVING) {
  367. if (neigh->nud_state & NUD_FAILED) {
  368. arpq = e->arpq_head;
  369. e->arpq_head = e->arpq_tail = NULL;
  370. } else if (neigh_is_connected(neigh))
  371. setup_l2e_send_pending(dev, NULL, e);
  372. } else {
  373. e->state = neigh_is_connected(neigh) ?
  374. L2T_STATE_VALID : L2T_STATE_STALE;
  375. if (memcmp(e->dmac, neigh->ha, 6))
  376. setup_l2e_send_pending(dev, NULL, e);
  377. }
  378. }
  379. spin_unlock_bh(&e->lock);
  380. if (arpq)
  381. handle_failed_resolution(dev, arpq);
  382. }
  383. struct l2t_data *t3_init_l2t(unsigned int l2t_capacity)
  384. {
  385. struct l2t_data *d;
  386. int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry);
  387. d = cxgb_alloc_mem(size);
  388. if (!d)
  389. return NULL;
  390. d->nentries = l2t_capacity;
  391. d->rover = &d->l2tab[1]; /* entry 0 is not used */
  392. atomic_set(&d->nfree, l2t_capacity - 1);
  393. rwlock_init(&d->lock);
  394. for (i = 0; i < l2t_capacity; ++i) {
  395. d->l2tab[i].idx = i;
  396. d->l2tab[i].state = L2T_STATE_UNUSED;
  397. spin_lock_init(&d->l2tab[i].lock);
  398. atomic_set(&d->l2tab[i].refcnt, 0);
  399. }
  400. return d;
  401. }
  /*
   * Release an L2 table obtained from t3_init_l2t().  Only the table memory
   * is freed; nothing is done here about neighbour references or packets
   * still attached to individual entries.
   */
  void t3_free_l2t(struct l2t_data *d)
  {
      cxgb_free_mem(d);
  }