Patches and procedure for running the client and server on the same machine.



I sent this email (below) to a developer who is trying to fix the kernel bug mentioned here. The LAST_ACK issue does clear itself up after a few minutes, so as long as you don't make lots of connections in a short amount of time, you will be fine.

This is all based on what our commercial traffic generator product does in
a more automated fashion.

Instructions and patches and scripts follow.


-------- Original Message --------

After starting/stopping a TCP connection several times, I see it get hung
with one side in LAST_ACK state and the other in TIME_WAIT.  I only see this
when I am sending to myself with the attached send-to-self patch, so it's likely
that either this patch is not completely right, or that the action is only triggerable
when sending to yourself for some reason.

You need a patched iperf that allows binding to a local device.  The iperf patch
is attached (adds support for the -E argument).

You need to patch the kernel with the sts patch.  This allows send-to-self.

Then run sts_script.sh, which sets up source-based routing on eth0 and eth1.
(You may need to edit this script to match your own network devices and addresses.)

eth0 should be connected to eth1 with a loopback cable.

After running the sts_script.sh, launch two instances of the modified iperf:

# Run server instance on eth0
iperf -B 10.2.0.1 -E eth0 -s

# And client on eth1.
iperf -B 10.2.0.2 -E eth1 -c 10.2.0.1


It took me 4 tries of starting/stopping the client before I saw the LAST_ACK state in netstat. That is the symptom of the bug.


Thanks, Ben

--
Ben Greear <greearb --at-- candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

Attachment: sts_script.sh
Description: application/shellscript

diff -urN iperf-2.0.2/include/Settings.hpp iperf-2.0.2.sts/include/Settings.hpp
--- iperf-2.0.2/include/Settings.hpp	2005-05-02 13:09:26.000000000 -0700
+++ iperf-2.0.2.sts/include/Settings.hpp	2007-05-18 14:46:31.000000000 -0700
@@ -112,6 +112,7 @@
     char*  mHost;                   // -c
     char*  mLocalhost;              // -B
     char*  mOutputFileName;         // -o
+    char*  mBindDev;         // -E eth0,  --bind_dev eth0
     FILE*  Extractor_file;
     ReportHeader*  reporthdr;
     MultiHeader*   multihdr;
diff -urN iperf-2.0.2/src/Client.cpp iperf-2.0.2.sts/src/Client.cpp
--- iperf-2.0.2/src/Client.cpp	2005-05-02 13:09:27.000000000 -0700
+++ iperf-2.0.2.sts/src/Client.cpp	2007-05-18 14:49:28.000000000 -0700
@@ -318,6 +318,16 @@
         WARN_errno( rc == SOCKET_ERROR, "bind" );
     }
 
+#ifndef WIN32
+    // Bind to a local device.  This will fail if not user root and may only work on Linux.
+    if ( mSettings->mBindDev != NULL ) {
+       if (setsockopt(mSettings->mSock, SOL_SOCKET, SO_BINDTODEVICE,
+                      mSettings->mBindDev, 16)) {
+          WARN_errno( rc == SOCKET_ERROR, "setsockopt-SO_BINDTODEVICE" );
+       }
+    }
+#endif
+
     // connect socket
     rc = connect( mSettings->mSock, (sockaddr*) &mSettings->peer, 
                   SockAddr_get_sizeof_sockaddr( &mSettings->peer ));
diff -urN iperf-2.0.2/src/Listener.cpp iperf-2.0.2.sts/src/Listener.cpp
--- iperf-2.0.2/src/Listener.cpp	2005-05-02 13:09:27.000000000 -0700
+++ iperf-2.0.2.sts/src/Listener.cpp	2007-05-18 14:49:30.000000000 -0700
@@ -323,6 +323,16 @@
     Socklen_t len = sizeof(boolean);
     setsockopt( mSettings->mSock, SOL_SOCKET, SO_REUSEADDR, (char*) &boolean, len );
 
+#ifndef WIN32
+    // Bind to a local device.  This will fail if not user root and may only work on Linux.
+    if ( mSettings->mBindDev != NULL ) {
+       if (setsockopt(mSettings->mSock, SOL_SOCKET, SO_BINDTODEVICE,
+                      mSettings->mBindDev, 16)) {
+          WARN_errno( rc == SOCKET_ERROR, "setsockopt-SO_BINDTODEVICE" );
+       }
+    }
+#endif
+
     // bind socket to server address
 #ifdef WIN32
     if ( SockAddr_isMulticast( &mSettings->local ) ) {
diff -urN iperf-2.0.2/src/Locale.c iperf-2.0.2.sts/src/Locale.c
--- iperf-2.0.2/src/Locale.c	2005-05-02 13:09:27.000000000 -0700
+++ iperf-2.0.2.sts/src/Locale.c	2007-05-18 14:52:29.000000000 -0700
@@ -139,6 +139,7 @@
   -u, --udp                use UDP rather than TCP\n\
   -w, --window    #[KM]    TCP window size (socket buffer size)\n\
   -B, --bind      <host>   bind to <host>, an interface or multicast address\n\
+  -E, --bind-dev  <dev>    bind to <device>, for example eth0  This only does the SO_BINDTODEVICE call.\n\
   -C, --compatibility      for use with older versions does not sent extra msgs\n\
   -M, --mss       #        set TCP maximum segment size (MTU - 40 bytes)\n\
   -N, --nodelay            set TCP no delay, disabling Nagle's Algorithm\n\
diff -urN iperf-2.0.2/src/Settings.cpp iperf-2.0.2.sts/src/Settings.cpp
--- iperf-2.0.2/src/Settings.cpp	2005-05-02 13:09:27.000000000 -0700
+++ iperf-2.0.2.sts/src/Settings.cpp	2007-05-18 14:37:51.000000000 -0700
@@ -108,6 +108,7 @@
 {"bind",       required_argument, NULL, 'B'},
 {"compatibility",    no_argument, NULL, 'C'},
 {"daemon",           no_argument, NULL, 'D'},
+{"bind-dev",    required_argument, NULL, 'E'},
 {"file_input", required_argument, NULL, 'F'},
 {"stdin_input",      no_argument, NULL, 'I'},
 {"mss",        required_argument, NULL, 'M'},
@@ -151,6 +152,7 @@
 {"IPERF_BIND",       required_argument, NULL, 'B'},
 {"IPERF_COMPAT",           no_argument, NULL, 'C'},
 {"IPERF_DAEMON",           no_argument, NULL, 'D'},
+{"IPERF_BIND_DEV",    required_argument, NULL, 'E'},
 {"IPERF_FILE_INPUT", required_argument, NULL, 'F'},
 {"IPERF_STDIN_INPUT",      no_argument, NULL, 'I'},
 {"IPERF_MSS",        required_argument, NULL, 'M'},
@@ -167,7 +169,7 @@
 
 #define SHORT_OPTIONS()
 
-const char short_options[] = "1b:c:df:hi:l:mn:o:p:rst:uvw:x:y:B:CDF:IL:M:NP:RS:T:UVW";
+const char short_options[] = "1b:c:df:hi:l:mn:o:p:rst:uvw:x:y:B:CDE:F:IL:M:NP:RS:T:UVW";
 
 /* -------------------------------------------------------------------
  * defaults
@@ -229,7 +231,7 @@
     main->mTTL          = 1;             // -T,  link-local TTL
     //main->mDomain     = kMode_IPv4;    // -V,
     //main->mSuggestWin = false;         // -W,  Suggest the window size.
-
+    //main->mBindDev = NULL // -E --bind-dev
 } // end Settings
 
 void Settings_Copy( thread_Settings *from, thread_Settings **into ) {
@@ -251,6 +253,10 @@
         (*into)->mFileName = new char[ strlen(from->mFileName) + 1];
         strcpy( (*into)->mFileName, from->mFileName );
     }
+    if ( from->mBindDev != NULL ) {
+        (*into)->mBindDev = new char[ strlen(from->mBindDev) + 1];
+        strcpy( (*into)->mBindDev, from->mBindDev );
+    }
     // Zero out certain entries
     (*into)->mTID = thread_zeroid();
     (*into)->runNext = NULL;
@@ -266,6 +272,7 @@
     DELETE_ARRAY( mSettings->mLocalhost );
     DELETE_ARRAY( mSettings->mFileName  );
     DELETE_ARRAY( mSettings->mOutputFileName );
+    DELETE_ARRAY( mSettings->mBindDev );
     DELETE_PTR( mSettings );
 } // end ~Settings
 
@@ -566,6 +573,11 @@
             setDaemon( mExtSettings );
             break;
 
+        case 'E' : // Bind to a particular device.
+            mExtSettings->mBindDev = new char[strlen(optarg)+1];
+            strcpy( mExtSettings->mBindDev, optarg);
+            break;
+
         case 'F' : // Get the input for the data stream from a file
             if ( mExtSettings->mThreadMode != kMode_Client ) {
                 fprintf( stderr, warn_invalid_server_option, option );
@@ -713,12 +725,17 @@
         (*listener)->mHost       = NULL;
         (*listener)->mLocalhost  = NULL;
         (*listener)->mOutputFileName = NULL;
+        (*listener)->mBindDev    = NULL;
         (*listener)->mMode       = kTest_Normal;
         (*listener)->mThreadMode = kMode_Listener;
         if ( client->mHost != NULL ) {
             (*listener)->mHost = new char[strlen( client->mHost ) + 1];
             strcpy( (*listener)->mHost, client->mHost );
         }
+        if ( client->mBindDev != NULL ) {
+            (*listener)->mBindDev = new char[strlen( client->mBindDev ) + 1];
+            strcpy( (*listener)->mBindDev, client->mBindDev );
+        }
         if ( client->mLocalhost != NULL ) {
             (*listener)->mLocalhost = new char[strlen( client->mLocalhost ) + 1];
             strcpy( (*listener)->mLocalhost, client->mLocalhost );
@@ -770,6 +787,7 @@
         }
         (*client)->mFileName   = NULL;
         (*client)->mHost       = NULL;
+        (*client)->mBindDev    = NULL;
         (*client)->mLocalhost  = NULL;
         (*client)->mOutputFileName = NULL;
         (*client)->mMode       = ((flags & RUN_NOW) == 0 ?
@@ -779,6 +797,10 @@
             (*client)->mLocalhost = new char[strlen( server->mLocalhost ) + 1];
             strcpy( (*client)->mLocalhost, server->mLocalhost );
         }
+        if ( server->mBindDev != NULL ) {
+            (*client)->mBindDev = new char[strlen( server->mBindDev ) + 1];
+            strcpy( (*client)->mBindDev, server->mBindDev );
+        }
         (*client)->mHost = new char[REPORT_ADDRLEN];
         if ( ((sockaddr*)&server->peer)->sa_family == AF_INET ) {
             inet_ntop( AF_INET, &((sockaddr_in*)&server->peer)->sin_addr, 

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index c0f7aec..88f78b6 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -31,6 +31,7 @@ struct ipv4_devconf
 	int	no_policy;
 	int	force_igmp_version;
 	int	promote_secondaries;
+	int	accept_sts;
 	void	*sysctl;
 };
 
@@ -84,6 +85,7 @@ struct in_device
 #define IN_DEV_ARPFILTER(in_dev)	(ipv4_devconf.arp_filter || (in_dev)->cnf.arp_filter)
 #define IN_DEV_ARP_ANNOUNCE(in_dev)	(max(ipv4_devconf.arp_announce, (in_dev)->cnf.arp_announce))
 #define IN_DEV_ARP_IGNORE(in_dev)	(max(ipv4_devconf.arp_ignore, (in_dev)->cnf.arp_ignore))
+#define IN_DEV_ACCEPT_STS(in_dev)      (max(ipv4_devconf.accept_sts, (in_dev)->cnf.accept_sts))
 
 struct in_ifaddr
 {
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 47f1c53..6c00bf4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -496,6 +496,7 @@ enum
 	NET_IPV4_CONF_ARP_IGNORE=19,
 	NET_IPV4_CONF_PROMOTE_SECONDARIES=20,
 	NET_IPV4_CONF_ARP_ACCEPT=21,
+	NET_IPV4_CONF_ACCEPT_STS=22,
 	__NET_IPV4_CONF_MAX
 };
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7110779..9866f1b 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -419,6 +419,26 @@ static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
 	return !inet_confirm_addr(dev, sip, tip, scope);
 }
 
+static int is_ip_on_dev(struct net_device* dev, __u32 ip) {
+      int rv = 0;
+      struct in_device* in_dev = in_dev_get(dev);
+      if (in_dev) {
+              struct in_ifaddr *ifa;
+
+              rcu_read_lock();
+              for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+                      if (ifa->ifa_address == ip) {
+                              /* match */
+                              rv = 1;
+                              break;
+                      }
+              }
+              rcu_read_unlock();
+              in_dev_put(in_dev);
+      }
+      return rv;
+}
+
 static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 {
 	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
@@ -430,8 +450,38 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 	if (ip_route_output_key(&rt, &fl) < 0)
 		return 1;
 	if (rt->u.dst.dev != dev) {
-		NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
-		flag = 1;
+		struct in_device *in_dev = in_dev_get(dev);
+		if (in_dev && IN_DEV_ACCEPT_STS(in_dev) &&
+		    (rt->u.dst.dev == &loopback_dev))  {
+			/* Accept these IFF target-ip == dev's IP */
+			/* TODO:  Need to force the ARP response back out the interface
+			 * instead of letting it route locally.
+			 */
+			
+			if (is_ip_on_dev(dev, tip)) {
+				/* OK, we'll let this special case slide, so that we can
+				 * arp from one local interface to another.  This seems
+				 * to work, but could use some review. --Ben
+				 */
+				/*printk("arp_filter, sip: %x tip: %x  dev: %s, STS override (ip on dev)\n",
+                                  sip, tip, dev->name);*/
+			}
+			else {
+				/*printk("arp_filter, sip: %x tip: %x  dev: %s, IP is NOT on dev\n",
+                                  sip, tip, dev->name);*/
+				NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
+				flag = 1;
+			}
+		}
+		else {
+			/*printk("arp_filter, not lpbk  sip: %x tip: %x  dev: %s  flgs: %hx  dst.dev: %p  lbk: %p\n",
+			  sip, tip, dev->name, dev->priv_flags, rt->u.dst.dev, &loopback_dev);*/
+			NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
+			flag = 1;
+		}
+		if (in_dev) {
+			in_dev_put(in_dev);
+		}
 	}
 	ip_rt_put(rt);
 	return flag;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7f95e6e..33ac2ed 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1513,6 +1513,15 @@ static struct devinet_sysctl_table {
 			.proc_handler	= &ipv4_doint_and_flush,
 			.strategy	= &ipv4_doint_and_flush_strategy,
 		},
+		{
+			.ctl_name       = NET_IPV4_CONF_ACCEPT_STS,
+			.procname       = "accept_sts",
+			.data           = &ipv4_devconf.accept_sts,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = &proc_dointvec,
+		},
+
 	},
 	.devinet_dev = {
 		{
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 837f295..9b57bf5 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -206,8 +206,16 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 
 	if (fib_lookup(&fl, &res))
 		goto last_resort;
-	if (res.type != RTN_UNICAST)
-		goto e_inval_res;
+	if (res.type != RTN_UNICAST) {
+		if ((res.type == RTN_LOCAL) &&
+		    (IN_DEV_ACCEPT_STS(in_dev))) {
+			/* All is OK */
+		}
+		else {
+			goto e_inval_res;
+		}
+	}
+
 	*spec_dst = FIB_RES_PREFSRC(res);
 	fib_combine_itag(itag, &res);
 #ifdef CONFIG_IP_ROUTE_MULTIPATH



Other Mailing lists | Author Index | Date Index | Subject Index | Thread Index