diff options
author | Linus Torvalds <torvalds@g5.osdl.org> | 2005-11-07 08:05:11 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-11-07 08:05:11 -0800 |
commit | 8e33ba49765484bc6de3a2f8143733713fa93bc1 (patch) | |
tree | 2ea080e478e4ee86a893b75db2d5c81ce14cbf10 | |
parent | 8cde0776ec1e86c270f65bf482f96288e6bf0023 (diff) | |
parent | 2d43f1128a4282fbe8442f40b4cbbac05d8f10aa (diff) |
Merge master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6
-rw-r--r-- | include/linux/pkt_sched.h | 50 | ||||
-rw-r--r-- | include/linux/skbuff.h | 38 | ||||
-rw-r--r-- | include/net/inet_ecn.h | 28 | ||||
-rw-r--r-- | include/net/inet_hashtables.h | 2 | ||||
-rw-r--r-- | include/net/red.h | 325 | ||||
-rw-r--r-- | net/core/stream.c | 12 | ||||
-rw-r--r-- | net/dccp/ipv4.c | 32 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 14 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_conntrack_helper_pptp.c | 4 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_conntrack_netlink.c | 19 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_nat_core.c | 6 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_nat_helper_pptp.c | 2 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_nat_proto_gre.c | 4 | ||||
-rw-r--r-- | net/ipv4/netfilter/ip_nat_proto_unknown.c | 2 | ||||
-rw-r--r-- | net/ipv4/netfilter/ipt_CONNMARK.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 2 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 15 | ||||
-rw-r--r-- | net/netfilter/nf_queue.c | 2 | ||||
-rw-r--r-- | net/netfilter/nfnetlink_log.c | 6 | ||||
-rw-r--r-- | net/netfilter/nfnetlink_queue.c | 6 | ||||
-rw-r--r-- | net/sched/sch_gred.c | 841 | ||||
-rw-r--r-- | net/sched/sch_netem.c | 122 | ||||
-rw-r--r-- | net/sched/sch_red.c | 418 |
24 files changed, 1066 insertions, 886 deletions
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 60ffcb9c579..e87b233615b 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -93,6 +93,7 @@ struct tc_fifo_qopt /* PRIO section */ #define TCQ_PRIO_BANDS 16 +#define TCQ_MIN_PRIO_BANDS 2 struct tc_prio_qopt { @@ -169,6 +170,7 @@ struct tc_red_qopt unsigned char Scell_log; /* cell size for idle damping */ unsigned char flags; #define TC_RED_ECN 1 +#define TC_RED_HARDDROP 2 }; struct tc_red_xstats @@ -194,38 +196,34 @@ enum #define TCA_GRED_MAX (__TCA_GRED_MAX - 1) -#define TCA_SET_OFF TCA_GRED_PARMS struct tc_gred_qopt { - __u32 limit; /* HARD maximal queue length (bytes) -*/ - __u32 qth_min; /* Min average length threshold (bytes) -*/ - __u32 qth_max; /* Max average length threshold (bytes) -*/ - __u32 DP; /* upto 2^32 DPs */ - __u32 backlog; - __u32 qave; - __u32 forced; - __u32 early; - __u32 other; - __u32 pdrop; - - unsigned char Wlog; /* log(W) */ - unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ - unsigned char Scell_log; /* cell size for idle damping */ - __u8 prio; /* prio of this VQ */ - __u32 packets; - __u32 bytesin; + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + __u32 DP; /* upto 2^32 DPs */ + __u32 backlog; + __u32 qave; + __u32 forced; + __u32 early; + __u32 other; + __u32 pdrop; + __u8 Wlog; /* log(W) */ + __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ + __u8 Scell_log; /* cell size for idle damping */ + __u8 prio; /* prio of this VQ */ + __u32 packets; + __u32 bytesin; }; + /* gred setup */ struct tc_gred_sopt { - __u32 DPs; - __u32 def_DP; - __u8 grio; - __u8 pad1; - __u16 pad2; + __u32 DPs; + __u32 def_DP; + __u8 grio; + __u8 flags; + __u16 pad1; }; /* HTB section */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4286d832166..fdfb8fe8c38 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -603,23 +603,23 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) */ /** - * __skb_queue_head - queue a buffer at the list head + * __skb_queue_after - queue a buffer at the list head * @list: list to use + * @prev: place after this buffer * @newsk: buffer to queue * - * Queue a buffer at the start of a list. This function takes no locks + * Queue a buffer int the middle of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ -extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); -static inline void __skb_queue_head(struct sk_buff_head *list, - struct sk_buff *newsk) +static inline void __skb_queue_after(struct sk_buff_head *list, + struct sk_buff *prev, + struct sk_buff *newsk) { - struct sk_buff *prev, *next; - + struct sk_buff *next; list->qlen++; - prev = (struct sk_buff *)list; + next = prev->next; newsk->next = next; newsk->prev = prev; @@ -627,6 +627,23 @@ static inline void __skb_queue_head(struct sk_buff_head *list, } /** + * __skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of a list. This function takes no locks + * and you must therefore hold required locks before calling it. + * + * A buffer cannot be placed on two lists at the same time. + */ +extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); +static inline void __skb_queue_head(struct sk_buff_head *list, + struct sk_buff *newsk) +{ + __skb_queue_after(list, (struct sk_buff *)list, newsk); +} + +/** * __skb_queue_tail - queue a buffer at the list tail * @list: list to use * @newsk: buffer to queue @@ -1203,6 +1220,11 @@ static inline void kunmap_skb_frag(void *vaddr) prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \ skb = skb->next) +#define skb_queue_reverse_walk(queue, skb) \ + for (skb = (queue)->prev; \ + prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \ + skb = skb->prev) + extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index f87845e2e96..b0c47e2eccf 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -2,6 +2,7 @@ #define _INET_ECN_H_ #include <linux/ip.h> +#include <linux/skbuff.h> #include <net/dsfield.h> enum { @@ -48,7 +49,7 @@ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) (label) |= __constant_htons(INET_ECN_ECT_0 << 4); \ } while (0) -static inline void IP_ECN_set_ce(struct iphdr *iph) +static inline int IP_ECN_set_ce(struct iphdr *iph) { u32 check = iph->check; u32 ecn = (iph->tos + 1) & INET_ECN_MASK; @@ -61,7 +62,7 @@ static inline void IP_ECN_set_ce(struct iphdr *iph) * INET_ECN_CE => 00 */ if (!(ecn & 2)) - return; + return !ecn; /* * The following gives us: @@ -72,6 +73,7 @@ static inline void IP_ECN_set_ce(struct iphdr *iph) iph->check = check + (check>=0xFFFF); iph->tos |= INET_ECN_CE; + return 1; } static inline void IP_ECN_clear(struct iphdr *iph) @@ -87,11 +89,12 @@ static inline void ipv4_copy_dscp(struct iphdr *outer, struct iphdr *inner) struct ipv6hdr; -static inline void IP6_ECN_set_ce(struct ipv6hdr *iph) +static inline int IP6_ECN_set_ce(struct ipv6hdr *iph) { if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph))) - return; + return 0; *(u32*)iph |= htonl(INET_ECN_CE << 20); + return 1; } static inline void IP6_ECN_clear(struct ipv6hdr *iph) @@ -105,4 +108,21 @@ static inline void ipv6_copy_dscp(struct ipv6hdr *outer, struct ipv6hdr *inner) ipv6_change_dsfield(inner, INET_ECN_MASK, dscp); } +static inline int INET_ECN_set_ce(struct sk_buff *skb) +{ + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + if (skb->nh.raw + sizeof(struct iphdr) <= skb->tail) + return IP_ECN_set_ce(skb->nh.iph); + break; + + case __constant_htons(ETH_P_IPV6): + if (skb->nh.raw + sizeof(struct ipv6hdr) <= skb->tail) + return IP6_ECN_set_ce(skb->nh.ipv6h); + break; + } + + return 0; +} + #endif diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index f50f9596834..07840baa934 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -125,9 +125,7 @@ struct inet_hashinfo { rwlock_t lhash_lock ____cacheline_aligned; atomic_t lhash_users; wait_queue_head_t lhash_wait; - spinlock_t portalloc_lock; kmem_cache_t *bind_bucket_cachep; - int port_rover; }; static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, diff --git a/include/net/red.h b/include/net/red.h new file mode 100644 index 00000000000..2ed4358e329 --- /dev/null +++ b/include/net/red.h @@ -0,0 +1,325 @@ +#ifndef __NET_SCHED_RED_H +#define __NET_SCHED_RED_H + +#include <linux/config.h> +#include <linux/types.h> +#include <net/pkt_sched.h> +#include <net/inet_ecn.h> +#include <net/dsfield.h> + +/* Random Early Detection (RED) algorithm. + ======================================= + + Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways + for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. + + This file codes a "divisionless" version of RED algorithm + as written down in Fig.17 of the paper. + + Short description. + ------------------ + + When a new packet arrives we calculate the average queue length: + + avg = (1-W)*avg + W*current_queue_len, + + W is the filter time constant (chosen as 2^(-Wlog)), it controls + the inertia of the algorithm. To allow larger bursts, W should be + decreased. + + if (avg > th_max) -> packet marked (dropped). + if (avg < th_min) -> packet passes. + if (th_min < avg < th_max) we calculate probability: + + Pb = max_P * (avg - th_min)/(th_max-th_min) + + and mark (drop) packet with this probability. + Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). + max_P should be small (not 1), usually 0.01..0.02 is good value. + + max_P is chosen as a number, so that max_P/(th_max-th_min) + is a negative power of two in order arithmetics to contain + only shifts. + + + Parameters, settable by user: + ----------------------------- + + qth_min - bytes (should be < qth_max/2) + qth_max - bytes (should be at least 2*qth_min and less limit) + Wlog - bits (<32) log(1/W). + Plog - bits (<32) + + Plog is related to max_P by formula: + + max_P = (qth_max-qth_min)/2^Plog; + + F.e. if qth_max=128K and qth_min=32K, then Plog=22 + corresponds to max_P=0.02 + + Scell_log + Stab + + Lookup table for log((1-W)^(t/t_ave). + + + NOTES: + + Upper bound on W. + ----------------- + + If you want to allow bursts of L packets of size S, + you should choose W: + + L + 1 - th_min/S < (1-(1-W)^L)/W + + th_min/S = 32 th_min/S = 4 + + log(W) L + -1 33 + -2 35 + -3 39 + -4 46 + -5 57 + -6 75 + -7 101 + -8 135 + -9 190 + etc. + */ + +#define RED_STAB_SIZE 256 +#define RED_STAB_MASK (RED_STAB_SIZE - 1) + +struct red_stats +{ + u32 prob_drop; /* Early probability drops */ + u32 prob_mark; /* Early probability marks */ + u32 forced_drop; /* Forced drops, qavg > max_thresh */ + u32 forced_mark; /* Forced marks, qavg > max_thresh */ + u32 pdrop; /* Drops due to queue limits */ + u32 other; /* Drops due to drop() calls */ + u32 backlog; +}; + +struct red_parms +{ + /* Parameters */ + u32 qth_min; /* Min avg length threshold: A scaled */ + u32 qth_max; /* Max avg length threshold: A scaled */ + u32 Scell_max; + u32 Rmask; /* Cached random mask, see red_rmask */ + u8 Scell_log; + u8 Wlog; /* log(W) */ + u8 Plog; /* random number bits */ + u8 Stab[RED_STAB_SIZE]; + + /* Variables */ + int qcount; /* Number of packets since last random + number generation */ + u32 qR; /* Cached random number */ + + unsigned long qavg; /* Average queue length: A scaled */ + psched_time_t qidlestart; /* Start of current idle period */ +}; + +static inline u32 red_rmask(u8 Plog) +{ + return Plog < 32 ? ((1 << Plog) - 1) : ~0UL; +} + +static inline void red_set_parms(struct red_parms *p, + u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, + u8 Scell_log, u8 *stab) +{ + /* Reset average queue length, the value is strictly bound + * to the parameters below, reseting hurts a bit but leaving + * it might result in an unreasonable qavg for a while. --TGR + */ + p->qavg = 0; + + p->qcount = -1; + p->qth_min = qth_min << Wlog; + p->qth_max = qth_max << Wlog; + p->Wlog = Wlog; + p->Plog = Plog; + p->Rmask = red_rmask(Plog); + p->Scell_log = Scell_log; + p->Scell_max = (255 << Scell_log); + + memcpy(p->Stab, stab, sizeof(p->Stab)); +} + +static inline int red_is_idling(struct red_parms *p) +{ + return !PSCHED_IS_PASTPERFECT(p->qidlestart); +} + +static inline void red_start_of_idle_period(struct red_parms *p) +{ + PSCHED_GET_TIME(p->qidlestart); +} + +static inline void red_end_of_idle_period(struct red_parms *p) +{ + PSCHED_SET_PASTPERFECT(p->qidlestart); +} + +static inline void red_restart(struct red_parms *p) +{ + red_end_of_idle_period(p); + p->qavg = 0; + p->qcount = -1; +} + +static inline unsigned long red_calc_qavg_from_idle_time(struct red_parms *p) +{ + psched_time_t now; + long us_idle; + int shift; + + PSCHED_GET_TIME(now); + us_idle = PSCHED_TDIFF_SAFE(now, p->qidlestart, p->Scell_max); + + /* + * The problem: ideally, average length queue recalcultion should + * be done over constant clock intervals. This is too expensive, so + * that the calculation is driven by outgoing packets. + * When the queue is idle we have to model this clock by hand. + * + * SF+VJ proposed to "generate": + * + * m = idletime / (average_pkt_size / bandwidth) + * + * dummy packets as a burst after idle time, i.e. + * + * p->qavg *= (1-W)^m + * + * This is an apparently overcomplicated solution (f.e. we have to + * precompute a table to make this calculation in reasonable time) + * I believe that a simpler model may be used here, + * but it is field for experiments. + */ + + shift = p->Stab[(us_idle >> p->Scell_log) & RED_STAB_MASK]; + + if (shift) + return p->qavg >> shift; + else { + /* Approximate initial part of exponent with linear function: + * + * (1-W)^m ~= 1-mW + ... + * + * Seems, it is the best solution to + * problem of too coarse exponent tabulation. + */ + us_idle = (p->qavg * us_idle) >> p->Scell_log; + + if (us_idle < (p->qavg >> 1)) + return p->qavg - us_idle; + else + return p->qavg >> 1; + } +} + +static inline unsigned long red_calc_qavg_no_idle_time(struct red_parms *p, + unsigned int backlog) +{ + /* + * NOTE: p->qavg is fixed point number with point at Wlog. + * The formula below is equvalent to floating point + * version: + * + * qavg = qavg*(1-W) + backlog*W; + * + * --ANK (980924) + */ + return p->qavg + (backlog - (p->qavg >> p->Wlog)); +} + +static inline unsigned long red_calc_qavg(struct red_parms *p, + unsigned int backlog) +{ + if (!red_is_idling(p)) + return red_calc_qavg_no_idle_time(p, backlog); + else + return red_calc_qavg_from_idle_time(p); +} + +static inline u32 red_random(struct red_parms *p) +{ + return net_random() & p->Rmask; +} + +static inline int red_mark_probability(struct red_parms *p, unsigned long qavg) +{ + /* The formula used below causes questions. + + OK. qR is random number in the interval 0..Rmask + i.e. 0..(2^Plog). If we used floating point + arithmetics, it would be: (2^Plog)*rnd_num, + where rnd_num is less 1. + + Taking into account, that qavg have fixed + point at Wlog, and Plog is related to max_P by + max_P = (qth_max-qth_min)/2^Plog; two lines + below have the following floating point equivalent: + + max_P*(qavg - qth_min)/(qth_max-qth_min) < rnd/qcount + + Any questions? --ANK (980924) + */ + return !(((qavg - p->qth_min) >> p->Wlog) * p->qcount < p->qR); +} + +enum { + RED_BELOW_MIN_THRESH, + RED_BETWEEN_TRESH, + RED_ABOVE_MAX_TRESH, +}; + +static inline int red_cmp_thresh(struct red_parms *p, unsigned long qavg) +{ + if (qavg < p->qth_min) + return RED_BELOW_MIN_THRESH; + else if (qavg >= p->qth_max) + return RED_ABOVE_MAX_TRESH; + else + return RED_BETWEEN_TRESH; +} + +enum { + RED_DONT_MARK, + RED_PROB_MARK, + RED_HARD_MARK, +}; + +static inline int red_action(struct red_parms *p, unsigned long qavg) +{ + switch (red_cmp_thresh(p, qavg)) { + case RED_BELOW_MIN_THRESH: + p->qcount = -1; + return RED_DONT_MARK; + + case RED_BETWEEN_TRESH: + if (++p->qcount) { + if (red_mark_probability(p, qavg)) { + p->qcount = 0; + p->qR = red_random(p); + return RED_PROB_MARK; + } + } else + p->qR = red_random(p); + + return RED_DONT_MARK; + + case RED_ABOVE_MAX_TRESH: + p->qcount = -1; + return RED_HARD_MARK; + } + + BUG(); + return RED_DONT_MARK; +} + +#endif diff --git a/net/core/stream.c b/net/core/stream.c index ac9edfdf874..15bfd03e802 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -52,8 +52,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) { struct task_struct *tsk = current; DEFINE_WAIT(wait); + int done; - while (1) { + do { if (sk->sk_err) return sock_error(sk); if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) @@ -65,13 +66,12 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; - if (sk_wait_event(sk, timeo_p, - !((1 << sk->sk_state) & - ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))) - break; + done = sk_wait_event(sk, timeo_p, + !((1 << sk->sk_state) & + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); finish_wait(sk->sk_sleep, &wait); sk->sk_write_pending--; - } + } while (!done); return 0; } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 6298cf58ff9..4b9bc81ae1a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -31,8 +31,6 @@ struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, .lhash_users = ATOMIC_INIT(0), .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), - .portalloc_lock = SPIN_LOCK_UNLOCKED, - .port_rover = 1024 - 1, }; EXPORT_SYMBOL_GPL(dccp_hashinfo); @@ -125,36 +123,15 @@ static int dccp_v4_hash_connect(struct sock *sk) int ret; if (snum == 0) { - int rover; int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; + int rover = net_random() % (high - low) + low; struct hlist_node *node; struct inet_timewait_sock *tw = NULL; local_bh_disable(); - - /* TODO. Actually it is not so bad idea to remove - * dccp_hashinfo.portalloc_lock before next submission to - * Linus. - * As soon as we touch this place at all it is time to think. - * - * Now it protects single _advisory_ variable - * dccp_hashinfo.port_rover, hence it is mostly useless. - * Code will work nicely if we just delete it, but - * I am afraid in contented case it will work not better or - * even worse: another cpu just will hit the same bucket - * and spin there. - * So some cpu salt could remove both contention and - * memory pingpong. Any ideas how to do this in a nice way? - */ - spin_lock(&dccp_hashinfo.portalloc_lock); - rover = dccp_hashinfo.port_rover; - do { - rover++; - if ((rover < low) || (rover > high)) - rover = low; head = &dccp_hashinfo.bhash[inet_bhashfn(rover, dccp_hashinfo.bhash_size)]; spin_lock(&head->lock); @@ -187,9 +164,9 @@ static int dccp_v4_hash_connect(struct sock *sk) next_port: spin_unlock(&head->lock); + if (++rover > high) + rover = low; } while (--remaining > 0); - dccp_hashinfo.port_rover = rover; - spin_unlock(&dccp_hashinfo.portalloc_lock); local_bh_enable(); @@ -197,9 +174,6 @@ static int dccp_v4_hash_connect(struct sock *sk) ok: /* All locks still held and bhs disabled */ - dccp_hashinfo.port_rover = rover; - spin_unlock(&dccp_hashinfo.portalloc_lock); - inet_bind_hash(sk, tb, rover); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(rover); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 94468a76c5b..3fe021f1a56 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -78,17 +78,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; - int rover; + int rover = net_random() % (high - low) + low; - spin_lock(&hashinfo->portalloc_lock); - if (hashinfo->port_rover < low) - rover = low; - else - rover = hashinfo->port_rover; do { - rover++; - if (rover > high) - rover = low; head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) @@ -97,9 +89,9 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, break; next: spin_unlock(&head->lock); + if (++rover > high) + rover = low; } while (--remaining > 0); - hashinfo->port_rover = rover; - spin_unlock(&hashinfo->portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c index 926a6684643..4108a5e12b3 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c @@ -270,14 +270,10 @@ exp_gre(struct ip_conntrack *master, exp_orig->expectfn = pptp_expectfn; exp_orig->flags = 0; - exp_orig->dir = IP_CT_DIR_ORIGINAL; - /* both expectations are identical apart from tuple */ memcpy(exp_reply, exp_orig, sizeof(*exp_reply)); memcpy(&exp_reply->tuple, &exp_tuples[1], sizeof(exp_reply->tuple)); - exp_reply->dir = !exp_orig->dir; - if (ip_nat_pptp_hook_exp_gre) ret = ip_nat_pptp_hook_exp_gre(exp_orig, exp_reply); else { diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 166e6069f12..82a65043a8e 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -815,7 +815,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, IPCTNL_MSG_CT_NEW, 1, ct); ip_conntrack_put(ct); if (err <= 0) - goto out; + goto free; err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); if (err < 0) @@ -824,9 +824,9 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, DEBUGP("leaving\n"); return 0; +free: + kfree_skb(skb2); out: - if (skb2) - kfree_skb(skb2); return -1; } @@ -1322,21 +1322,16 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, 1, exp); if (err <= 0) - goto out; + goto free; ip_conntrack_expect_put(exp); - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); - if (err < 0) - goto free; - - return err; + return netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); +free: + kfree_skb(skb2); out: ip_conntrack_expect_put(exp); -free: - if (skb2) - kfree_skb(skb2); return err; } diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index c5e3abd2467..762f4d93936 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -66,10 +66,8 @@ ip_nat_proto_find_get(u_int8_t protonum) * removed until we've grabbed the reference */ preempt_disable(); p = __ip_nat_proto_find(protonum); - if (p) { - if (!try_module_get(p->me)) - p = &ip_nat_unknown_protocol; - } + if (!try_module_get(p->me)) + p = &ip_nat_unknown_protocol; preempt_enable(); return p; diff --git a/net/ipv4/netfilter/ip_nat_helper_pptp.c b/net/ipv4/netfilter/ip_nat_helper_pptp.c index 3cdd0684d30..ee6ab74ad3a 100644 --- a/net/ipv4/netfilter/ip_nat_helper_pptp.c +++ b/net/ipv4/netfilter/ip_nat_helper_pptp.c @@ -216,6 +216,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, expect_orig->saved_proto.gre.key = htons(nat_pptp_info->pac_call_id); expect_orig->tuple.src.u.gre.key = htons(nat_pptp_info->pns_call_id); expect_orig->tuple.dst.u.gre.key = htons(ct_pptp_info->pac_call_id); + expect_orig->dir = IP_CT_DIR_ORIGINAL; inv_t.src.ip = reply_t->src.ip; inv_t.dst.ip = reply_t->dst.ip; inv_t.src.u.gre.key = htons(nat_pptp_info->pac_call_id); @@ -233,6 +234,7 @@ pptp_exp_gre(struct ip_conntrack_expect *expect_orig, expect_reply->saved_proto.gre.key = htons(nat_pptp_info->pns_call_id); expect_reply->tuple.src.u.gre.key = htons(nat_pptp_info->pac_call_id); expect_reply->tuple.dst.u.gre.key = htons(ct_pptp_info->pns_call_id); + expect_reply->dir = IP_CT_DIR_REPLY; inv_t.src.ip = orig_t->src.ip; inv_t.dst.ip = orig_t->dst.ip; inv_t.src.u.gre.key = htons(nat_pptp_info->pns_call_id); diff --git a/net/ipv4/netfilter/ip_nat_proto_gre.c b/net/ipv4/netfilter/ip_nat_proto_gre.c index 7c128540167..f7cad7cf1ae 100644 --- a/net/ipv4/netfilter/ip_nat_proto_gre.c +++ b/net/ipv4/netfilter/ip_nat_proto_gre.c @@ -139,8 +139,8 @@ gre_manip_pkt(struct sk_buff **pskb, break; case GRE_VERSION_PPTP: DEBUGP("call_id -> 0x%04x\n", - ntohl(tuple->dst.u.gre.key)); - pgreh->call_id = htons(ntohl(tuple->dst.u.gre.key)); + ntohs(tuple->dst.u.gre.key)); + pgreh->call_id = tuple->dst.u.gre.key; break; default: DEBUGP("can't nat unknown GRE version\n"); diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index 99bbef56f84..f0099a646a0 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -62,7 +62,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) struct ip_nat_protocol ip_nat_unknown_protocol = { .name = "unknown", - .me = THIS_MODULE, + /* .me isn't set: getting a ref to this cannot fail. */ .manip_pkt = unknown_manip_pkt, .in_range = unknown_in_range, .unique_tuple = unknown_unique_tuple, diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 13463802133..05d66ab5942 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -109,6 +109,7 @@ static struct ipt_target ipt_connmark_reg = { static int __init init(void) { + need_ip_conntrack(); return ipt_register_target(&ipt_connmark_reg); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f3f0013a958..72b7c22e1ea 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2112,7 +2112,6 @@ void __init tcp_init(void) sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } - tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; sysctl_tcp_mem[0] = 768 << order; sysctl_tcp_mem[1] = 1024 << order; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c85819d8474..49d67cd75ed 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -93,8 +93,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, .lhash_users = ATOMIC_INIT(0), .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), - .portalloc_lock = SPIN_LOCK_UNLOCKED, - .port_rover = 1024 - 1, }; static int tcp_v4_get_port(struct sock *sk, unsigned short snum) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d693cb988b7..d746d3b27ef 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -114,16 +114,9 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) int low = sysctl_local_port_range[0]; int high = sysctl_local_port_range[1]; int remaining = (high - low) + 1; |