소스 검색

get rid of NR_OPEN and introduce a sysctl_nr_open

NR_OPEN (historically set to 1024*1024) actually forbids processes from
opening more than 1024*1024 handles.

Unfortunately, some production servers hit the not so 'ridiculously high
value' of 1024*1024 file descriptors per process.

Changing NR_OPEN is not considered safe because of potential vmalloc space
exhaustion.

This patch introduces a new sysctl (/proc/sys/fs/nr_open) which defaults to
1024*1024, so that admins can decide to change this limit if their workload
needs it.

[akpm@linux-foundation.org: export it for sparc64]
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Eric Dumazet 17 년 전
부모
커밋
9cfe015aa4

+ 8 - 0
Documentation/filesystems/proc.txt

@@ -1029,6 +1029,14 @@ nr_inodes
 Denotes the  number  of  inodes the system has allocated. This number will
 Denotes the  number  of  inodes the system has allocated. This number will
 grow and shrink dynamically.
 grow and shrink dynamically.
 
 
+nr_open
+-------
+
+Denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
 nr_free_inodes
 nr_free_inodes
 --------------
 --------------
 
 

+ 10 - 0
Documentation/sysctl/fs.txt

@@ -23,6 +23,7 @@ Currently, these files are in /proc/sys/fs:
 - inode-max
 - inode-max
 - inode-nr
 - inode-nr
 - inode-state
 - inode-state
+- nr_open
 - overflowuid
 - overflowuid
 - overflowgid
 - overflowgid
 - suid_dumpable
 - suid_dumpable
@@ -91,6 +92,15 @@ usage of file handles and you don't need to increase the maximum.
 
 
 ==============================================================
 ==============================================================
 
 
+nr_open:
+
+This denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
+==============================================================
+
 inode-max, inode-nr & inode-state:
 inode-max, inode-nr & inode-state:
 
 
 As with file handles, the kernel allocates the inode structures
 As with file handles, the kernel allocates the inode structures

+ 1 - 1
arch/alpha/kernel/osf_sys.c

@@ -430,7 +430,7 @@ sys_getpagesize(void)
 asmlinkage unsigned long
 asmlinkage unsigned long
 sys_getdtablesize(void)
 sys_getdtablesize(void)
 {
 {
-	return NR_OPEN;
+	return sysctl_nr_open;
 }
 }
 
 
 /*
 /*

+ 1 - 1
arch/mips/kernel/sysirix.c

@@ -356,7 +356,7 @@ asmlinkage int irix_syssgi(struct pt_regs *regs)
 			retval = NGROUPS_MAX;
 			retval = NGROUPS_MAX;
 			goto out;
 			goto out;
 		case 5:
 		case 5:
-			retval = NR_OPEN;
+			retval = sysctl_nr_open;
 			goto out;
 			goto out;
 		case 6:
 		case 6:
 			retval = 1;
 			retval = 1;

+ 1 - 0
arch/sparc64/kernel/sparc64_ksyms.c

@@ -277,6 +277,7 @@ EXPORT_SYMBOL(sys_getpid);
 EXPORT_SYMBOL(sys_geteuid);
 EXPORT_SYMBOL(sys_geteuid);
 EXPORT_SYMBOL(sys_getuid);
 EXPORT_SYMBOL(sys_getuid);
 EXPORT_SYMBOL(sys_getegid);
 EXPORT_SYMBOL(sys_getegid);
+EXPORT_SYMBOL(sysctl_nr_open);
 EXPORT_SYMBOL(sys_getgid);
 EXPORT_SYMBOL(sys_getgid);
 EXPORT_SYMBOL(svr4_getcontext);
 EXPORT_SYMBOL(svr4_getcontext);
 EXPORT_SYMBOL(svr4_setcontext);
 EXPORT_SYMBOL(svr4_setcontext);

+ 1 - 1
arch/sparc64/solaris/fs.c

@@ -624,7 +624,7 @@ asmlinkage int solaris_ulimit(int cmd, int val)
 	case 3: /* UL_GMEMLIM */
 	case 3: /* UL_GMEMLIM */
 		return current->signal->rlim[RLIMIT_DATA].rlim_cur;
 		return current->signal->rlim[RLIMIT_DATA].rlim_cur;
 	case 4: /* UL_GDESLIM */
 	case 4: /* UL_GDESLIM */
-		return NR_OPEN;
+		return sysctl_nr_open;
 	}
 	}
 	return -EINVAL;
 	return -EINVAL;
 }
 }

+ 4 - 2
arch/sparc64/solaris/timod.c

@@ -859,7 +859,8 @@ asmlinkage int solaris_getmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3)
 
 
 	SOLD("entry");
 	SOLD("entry");
 	lock_kernel();
 	lock_kernel();
-	if(fd >= NR_OPEN) goto out;
+	if (fd >= sysctl_nr_open)
+		goto out;
 
 
 	fdt = files_fdtable(current->files);
 	fdt = files_fdtable(current->files);
 	filp = fdt->fd[fd];
 	filp = fdt->fd[fd];
@@ -927,7 +928,8 @@ asmlinkage int solaris_putmsg(unsigned int fd, u32 arg1, u32 arg2, u32 arg3)
 
 
 	SOLD("entry");
 	SOLD("entry");
 	lock_kernel();
 	lock_kernel();
-	if(fd >= NR_OPEN) goto out;
+	if (fd >= sysctl_nr_open)
+		goto out;
 
 
 	fdt = files_fdtable(current->files);
 	fdt = files_fdtable(current->files);
 	filp = fdt->fd[fd];
 	filp = fdt->fd[fd];

+ 5 - 3
fs/file.c

@@ -24,6 +24,8 @@ struct fdtable_defer {
 	struct fdtable *next;
 	struct fdtable *next;
 };
 };
 
 
+int sysctl_nr_open __read_mostly = 1024*1024;
+
 /*
 /*
  * We use this list to defer free fdtables that have vmalloced
  * We use this list to defer free fdtables that have vmalloced
  * sets/arrays. By keeping a per-cpu list, we avoid having to embed
  * sets/arrays. By keeping a per-cpu list, we avoid having to embed
@@ -147,8 +149,8 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	nr /= (1024 / sizeof(struct file *));
 	nr /= (1024 / sizeof(struct file *));
 	nr = roundup_pow_of_two(nr + 1);
 	nr = roundup_pow_of_two(nr + 1);
 	nr *= (1024 / sizeof(struct file *));
 	nr *= (1024 / sizeof(struct file *));
-	if (nr > NR_OPEN)
-		nr = NR_OPEN;
+	if (nr > sysctl_nr_open)
+		nr = sysctl_nr_open;
 
 
 	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
 	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
 	if (!fdt)
 	if (!fdt)
@@ -233,7 +235,7 @@ int expand_files(struct files_struct *files, int nr)
 	if (nr < fdt->max_fds)
 	if (nr < fdt->max_fds)
 		return 0;
 		return 0;
 	/* Can we expand? */
 	/* Can we expand? */
-	if (nr >= NR_OPEN)
+	if (nr >= sysctl_nr_open)
 		return -EMFILE;
 		return -EMFILE;
 
 
 	/* All good, so we try */
 	/* All good, so we try */

+ 1 - 1
include/linux/fs.h

@@ -21,7 +21,7 @@
 
 
 /* Fixed constants first: */
 /* Fixed constants first: */
 #undef NR_OPEN
 #undef NR_OPEN
-#define NR_OPEN (1024*1024)	/* Absolute upper limit on fd num */
+extern int sysctl_nr_open;
 #define INR_OPEN 1024		/* Initial setting for nfile rlimits */
 #define INR_OPEN 1024		/* Initial setting for nfile rlimits */
 
 
 #define BLOCK_SIZE_BITS 10
 #define BLOCK_SIZE_BITS 10

+ 1 - 1
kernel/sys.c

@@ -1472,7 +1472,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
 	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
 	if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
 	    !capable(CAP_SYS_RESOURCE))
 	    !capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 		return -EPERM;
-	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
+	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
 		return -EPERM;
 		return -EPERM;
 
 
 	retval = security_task_setrlimit(resource, &new_rlim);
 	retval = security_task_setrlimit(resource, &new_rlim);

+ 8 - 0
kernel/sysctl.c

@@ -1202,6 +1202,14 @@ static struct ctl_table fs_table[] = {
 		.mode		= 0644,
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 		.proc_handler	= &proc_dointvec,
 	},
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_open",
+		.data		= &sysctl_nr_open,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 	{
 		.ctl_name	= FS_DENTRY,
 		.ctl_name	= FS_DENTRY,
 		.procname	= "dentry-state",
 		.procname	= "dentry-state",