Merge branch 'fixes' into for-linus

Russell King, 12 years ago
Commit b4874a3d29

+ 1 - 1
Makefile

@@ -1,7 +1,7 @@
 VERSION = 3
 PATCHLEVEL = 6
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
 NAME = Terrified Chipmunk
 
 # *DOCUMENTATION*

+ 1 - 0
arch/arm/include/asm/syscall.h

@@ -8,6 +8,7 @@
 #define _ASM_ARM_SYSCALL_H
 
 #include <linux/err.h>
+#include <linux/sched.h>
 
 #include <asm/unistd.h>
 

+ 0 - 9
arch/um/include/asm/processor-generic.h

@@ -20,14 +20,6 @@ struct mm_struct;
 
 struct thread_struct {
 	struct task_struct *saved_task;
-	/*
-	 * This flag is set to 1 before calling do_fork (and analyzed in
-	 * copy_thread) to mark that we are begin called from userspace (fork /
-	 * vfork / clone), and reset to 0 after. It is left to 0 when called
-	 * from kernelspace (i.e. kernel_thread() or fork_idle(),
-	 * as of 2.6.11).
-	 */
-	int forking;
 	struct pt_regs regs;
 	int singlestep_syscall;
 	void *fault_addr;
@@ -58,7 +50,6 @@ struct thread_struct {
 
 #define INIT_THREAD \
 { \
-	.forking		= 0, \
 	.regs		   	= EMPTY_REGS,	\
 	.fault_addr		= NULL, \
 	.prev_sched		= NULL, \

+ 0 - 10
arch/um/include/shared/common-offsets.h

@@ -7,16 +7,6 @@ DEFINE(UM_KERN_PAGE_MASK, PAGE_MASK);
 DEFINE(UM_KERN_PAGE_SHIFT, PAGE_SHIFT);
 DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
 
-DEFINE_STR(UM_KERN_EMERG, KERN_EMERG);
-DEFINE_STR(UM_KERN_ALERT, KERN_ALERT);
-DEFINE_STR(UM_KERN_CRIT, KERN_CRIT);
-DEFINE_STR(UM_KERN_ERR, KERN_ERR);
-DEFINE_STR(UM_KERN_WARNING, KERN_WARNING);
-DEFINE_STR(UM_KERN_NOTICE, KERN_NOTICE);
-DEFINE_STR(UM_KERN_INFO, KERN_INFO);
-DEFINE_STR(UM_KERN_DEBUG, KERN_DEBUG);
-DEFINE_STR(UM_KERN_CONT, KERN_CONT);
-
 DEFINE(UM_ELF_CLASS, ELF_CLASS);
 DEFINE(UM_ELFCLASS32, ELFCLASS32);
 DEFINE(UM_ELFCLASS64, ELFCLASS64);

+ 11 - 0
arch/um/include/shared/user.h

@@ -26,6 +26,17 @@
 extern void panic(const char *fmt, ...)
 	__attribute__ ((format (printf, 1, 2)));
 
+/* Requires preincluding include/linux/kern_levels.h */
+#define UM_KERN_EMERG	KERN_EMERG
+#define UM_KERN_ALERT	KERN_ALERT
+#define UM_KERN_CRIT	KERN_CRIT
+#define UM_KERN_ERR	KERN_ERR
+#define UM_KERN_WARNING	KERN_WARNING
+#define UM_KERN_NOTICE	KERN_NOTICE
+#define UM_KERN_INFO	KERN_INFO
+#define UM_KERN_DEBUG	KERN_DEBUG
+#define UM_KERN_CONT	KERN_CONT
+
 #ifdef UML_CONFIG_PRINTK
 extern int printk(const char *fmt, ...)
 	__attribute__ ((format (printf, 1, 2)));

+ 6 - 19
arch/um/kernel/exec.c

@@ -39,34 +39,21 @@ void flush_thread(void)
 
 void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
 {
+	get_safe_registers(regs->regs.gp, regs->regs.fp);
 	PT_REGS_IP(regs) = eip;
 	PT_REGS_SP(regs) = esp;
-}
-EXPORT_SYMBOL(start_thread);
-
-static long execve1(const char *file,
-		    const char __user *const __user *argv,
-		    const char __user *const __user *env)
-{
-	long error;
-
-	error = do_execve(file, argv, env, &current->thread.regs);
-	if (error == 0) {
-		task_lock(current);
-		current->ptrace &= ~PT_DTRACE;
+	current->ptrace &= ~PT_DTRACE;
 #ifdef SUBARCH_EXECVE1
-		SUBARCH_EXECVE1(&current->thread.regs.regs);
+	SUBARCH_EXECVE1(regs->regs);
 #endif
-		task_unlock(current);
-	}
-	return error;
 }
+EXPORT_SYMBOL(start_thread);
 
 long um_execve(const char *file, const char __user *const __user *argv, const char __user *const __user *env)
 {
 	long err;
 
-	err = execve1(file, argv, env);
+	err = do_execve(file, argv, env, &current->thread.regs);
 	if (!err)
 		UML_LONGJMP(current->thread.exec_buf, 1);
 	return err;
@@ -81,7 +68,7 @@ long sys_execve(const char __user *file, const char __user *const __user *argv,
 	filename = getname(file);
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename)) goto out;
-	error = execve1(filename, argv, env);
+	error = do_execve(filename, argv, env, &current->thread.regs);
 	putname(filename);
  out:
 	return error;

+ 4 - 4
arch/um/kernel/process.c

@@ -181,11 +181,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		struct pt_regs *regs)
 {
 	void (*handler)(void);
+	int kthread = current->flags & PF_KTHREAD;
 	int ret = 0;
 
 	p->thread = (struct thread_struct) INIT_THREAD;
 
-	if (current->thread.forking) {
+	if (!kthread) {
 	  	memcpy(&p->thread.regs.regs, &regs->regs,
 		       sizeof(p->thread.regs.regs));
 		PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0);
@@ -195,8 +196,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		handler = fork_handler;
 
 		arch_copy_thread(&current->thread.arch, &p->thread.arch);
-	}
-	else {
+	} else {
 		get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp);
 		p->thread.request.u.thread = current->thread.request.u.thread;
 		handler = new_thread_handler;
@@ -204,7 +204,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 
 	new_thread(task_stack_page(p), &p->thread.switch_buf, handler);
 
-	if (current->thread.forking) {
+	if (!kthread) {
 		clear_flushed_tls(p);
 
 		/*

+ 5 - 1
arch/um/kernel/signal.c

@@ -22,9 +22,13 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr,
 			 struct k_sigaction *ka, siginfo_t *info)
 {
 	sigset_t *oldset = sigmask_to_save();
+	int singlestep = 0;
 	unsigned long sp;
 	int err;
 
+	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
+		singlestep = 1;
+
 	/* Did we come from a system call? */
 	if (PT_REGS_SYSCALL_NR(regs) >= 0) {
 		/* If so, check system call restarting.. */
@@ -61,7 +65,7 @@ static void handle_signal(struct pt_regs *regs, unsigned long signr,
 	if (err)
 		force_sigsegv(signr, current);
 	else
-		signal_delivered(signr, info, ka, regs, 0);
+		signal_delivered(signr, info, ka, regs, singlestep);
 }
 
 static int kern_do_signal(struct pt_regs *regs)

+ 12 - 12
arch/um/kernel/syscall.c

@@ -17,25 +17,25 @@
 
 long sys_fork(void)
 {
-	long ret;
-
-	current->thread.forking = 1;
-	ret = do_fork(SIGCHLD, UPT_SP(&current->thread.regs.regs),
+	return do_fork(SIGCHLD, UPT_SP(&current->thread.regs.regs),
 		      &current->thread.regs, 0, NULL, NULL);
-	current->thread.forking = 0;
-	return ret;
 }
 
 long sys_vfork(void)
 {
-	long ret;
-
-	current->thread.forking = 1;
-	ret = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD,
+	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD,
 		      UPT_SP(&current->thread.regs.regs),
 		      &current->thread.regs, 0, NULL, NULL);
-	current->thread.forking = 0;
-	return ret;
+}
+
+long sys_clone(unsigned long clone_flags, unsigned long newsp,
+	       void __user *parent_tid, void __user *child_tid)
+{
+	if (!newsp)
+		newsp = UPT_SP(&current->thread.regs.regs);
+
+	return do_fork(clone_flags, newsp, &current->thread.regs, 0, parent_tid,
+		      child_tid);
 }
 
 long old_mmap(unsigned long addr, unsigned long len,
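
For the hunk above, note the fallback: when userspace passes a NULL child stack, the consolidated sys_clone() reuses the caller's saved stack pointer, which is exactly what makes a bare clone(SIGCHLD, NULL, ...) behave like fork(). A hypothetical userspace test (not part of this commit) exercising that path:

#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* Raw clone(2) with a NULL child stack: the child continues on a
	 * CoW copy of the parent's stack, i.e. fork() behaviour. The
	 * trailing tid/tls arguments are unused and passed as 0. */
	long pid = syscall(SYS_clone, (long)SIGCHLD, 0L, 0L, 0L, 0L);

	if (pid < 0)
		return 1;
	if (pid == 0) {			/* child */
		printf("child %d\n", (int)getpid());
		_exit(0);
	}
	waitpid((pid_t)pid, NULL, 0);
	return 0;
}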

+ 1 - 1
arch/um/scripts/Makefile.rules

@@ -8,7 +8,7 @@ USER_OBJS += $(filter %_user.o,$(obj-y) $(obj-m)  $(USER_SINGLE_OBJS))
 USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
 
 $(USER_OBJS:.o=.%): \
-	c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) -include user.h $(CFLAGS_$(basetarget).o)
+	c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) -include $(srctree)/include/linux/kern_levels.h -include user.h $(CFLAGS_$(basetarget).o)
 
 # These are like USER_OBJS but filter USER_CFLAGS through unprofile instead of
 # using it directly.
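
Taken together with the user.h and common-offsets.h hunks above, this -include means every USER_OBJS translation unit now sees the KERN_* macros directly, so the DEFINE_STR() assembly round-trip could be deleted. A rough sketch of the resulting expansion (macro values believed to match 3.6's include/linux/kern_levels.h; check the header for the authoritative definitions):

/* Sketch only; KERN_* values as believed to be defined in 3.6. */
#define KERN_SOH	"\001"		/* ASCII SOH starts a level prefix */
#define KERN_ERR	KERN_SOH "3"	/* error severity */

#define UM_KERN_ERR	KERN_ERR	/* the alias user.h adds above */

/* A user-side caller thus emits an in-band level prefix: */
static const char *msg = UM_KERN_ERR "uml: request failed\n"; /* "\0013uml: ..." */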

+ 1 - 0
arch/x86/um/Kconfig

@@ -21,6 +21,7 @@ config 64BIT
 config X86_32
 	def_bool !64BIT
 	select HAVE_AOUT
+	select ARCH_WANT_IPC_PARSE_VERSION
 
 config X86_64
 	def_bool 64BIT

+ 0 - 3
arch/x86/um/shared/sysdep/kernel-offsets.h

@@ -7,9 +7,6 @@
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
 
-#define STR(x) #x
-#define DEFINE_STR(sym, val) asm volatile("\n->" #sym " " STR(val) " " #val: : )
-
 #define BLANK() asm volatile("\n->" : : )
 
 #define OFFSET(sym, str, mem) \

+ 2 - 0
arch/x86/um/shared/sysdep/syscalls.h

@@ -1,3 +1,5 @@
+extern long sys_clone(unsigned long clone_flags, unsigned long newsp,
+	       void __user *parent_tid, void __user *child_tid);
 #ifdef __i386__
 #include "syscalls_32.h"
 #else

+ 0 - 6
arch/x86/um/signal.c

@@ -416,9 +416,6 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
 	PT_REGS_AX(regs) = (unsigned long) sig;
 	PT_REGS_DX(regs) = (unsigned long) 0;
 	PT_REGS_CX(regs) = (unsigned long) 0;
-
-	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
-		ptrace_notify(SIGTRAP);
 	return 0;
 }
 
@@ -466,9 +463,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
 	PT_REGS_AX(regs) = (unsigned long) sig;
 	PT_REGS_DX(regs) = (unsigned long) &frame->info;
 	PT_REGS_CX(regs) = (unsigned long) &frame->uc;
-
-	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED))
-		ptrace_notify(SIGTRAP);
 	return 0;
 }
 

+ 1 - 1
arch/x86/um/sys_call_table_32.c

@@ -28,7 +28,7 @@
 #define ptregs_execve sys_execve
 #define ptregs_iopl sys_iopl
 #define ptregs_vm86old sys_vm86old
-#define ptregs_clone sys_clone
+#define ptregs_clone i386_clone
 #define ptregs_vm86 sys_vm86
 #define ptregs_sigaltstack sys_sigaltstack
 #define ptregs_vfork sys_vfork

+ 7 - 20
arch/x86/um/syscalls_32.c

@@ -3,37 +3,24 @@
  * Licensed under the GPL
  */
 
-#include "linux/sched.h"
-#include "linux/shm.h"
-#include "linux/ipc.h"
-#include "linux/syscalls.h"
-#include "asm/mman.h"
-#include "asm/uaccess.h"
-#include "asm/unistd.h"
+#include <linux/syscalls.h>
+#include <sysdep/syscalls.h>
 
 /*
  * The prototype on i386 is:
  *
- *     int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls, int * child_tidptr)
+ *     int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls)
  *
  * and the "newtls" arg. on i386 is read by copy_thread directly from the
  * register saved on the stack.
  */
-long sys_clone(unsigned long clone_flags, unsigned long newsp,
-	       int __user *parent_tid, void *newtls, int __user *child_tid)
+long i386_clone(unsigned long clone_flags, unsigned long newsp,
+		int __user *parent_tid, void *newtls, int __user *child_tid)
 {
-	long ret;
-
-	if (!newsp)
-		newsp = UPT_SP(&current->thread.regs.regs);
-
-	current->thread.forking = 1;
-	ret = do_fork(clone_flags, newsp, &current->thread.regs, 0, parent_tid,
-		      child_tid);
-	current->thread.forking = 0;
-	return ret;
+	return sys_clone(clone_flags, newsp, parent_tid, child_tid);
 }
 
+
 long sys_sigaction(int sig, const struct old_sigaction __user *act,
 			 struct old_sigaction __user *oact)
 {

+ 3 - 20
arch/x86/um/syscalls_64.c

@@ -5,12 +5,9 @@
  * Licensed under the GPL
  */
 
-#include "linux/linkage.h"
-#include "linux/personality.h"
-#include "linux/utsname.h"
-#include "asm/prctl.h" /* XXX This should get the constants from libc */
-#include "asm/uaccess.h"
-#include "os.h"
+#include <linux/sched.h>
+#include <asm/prctl.h> /* XXX This should get the constants from libc */
+#include <os.h>
 
 long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
 {
@@ -79,20 +76,6 @@ long sys_arch_prctl(int code, unsigned long addr)
 	return arch_prctl(current, code, (unsigned long __user *) addr);
 }
 
-long sys_clone(unsigned long clone_flags, unsigned long newsp,
-	       void __user *parent_tid, void __user *child_tid)
-{
-	long ret;
-
-	if (!newsp)
-		newsp = UPT_SP(&current->thread.regs.regs);
-	current->thread.forking = 1;
-	ret = do_fork(clone_flags, newsp, &current->thread.regs, 0, parent_tid,
-		      child_tid);
-	current->thread.forking = 0;
-	return ret;
-}
-
 void arch_switch_to(struct task_struct *to)
 {
 	if ((to->thread.arch.fs == 0) || (to->mm == NULL))

+ 103 - 50
drivers/block/nvme.c

@@ -79,6 +79,7 @@ struct nvme_dev {
 	char serial[20];
 	char model[40];
 	char firmware_rev[8];
+	u32 max_hw_sectors;
 };
 
 /*
@@ -835,15 +836,15 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
 }
 
 static int nvme_get_features(struct nvme_dev *dev, unsigned fid,
-				unsigned dword11, dma_addr_t dma_addr)
+				unsigned nsid, dma_addr_t dma_addr)
 {
 	struct nvme_command c;
 
 	memset(&c, 0, sizeof(c));
 	c.features.opcode = nvme_admin_get_features;
+	c.features.nsid = cpu_to_le32(nsid);
 	c.features.prp1 = cpu_to_le64(dma_addr);
 	c.features.fid = cpu_to_le32(fid);
-	c.features.dword11 = cpu_to_le32(dword11);
 
 	return nvme_submit_admin_cmd(dev, &c, NULL);
 }
@@ -862,11 +863,51 @@ static int nvme_set_features(struct nvme_dev *dev, unsigned fid,
 	return nvme_submit_admin_cmd(dev, &c, result);
 }
 
+/**
+ * nvme_cancel_ios - Cancel outstanding I/Os
+ * @nvmeq: The queue to cancel I/Os on
+ * @timeout: True to only cancel I/Os which have timed out
+ */
+static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
+{
+	int depth = nvmeq->q_depth - 1;
+	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
+	unsigned long now = jiffies;
+	int cmdid;
+
+	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
+		void *ctx;
+		nvme_completion_fn fn;
+		static struct nvme_completion cqe = {
+			.status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1,
+		};
+
+		if (timeout && !time_after(now, info[cmdid].timeout))
+			continue;
+		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid);
+		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
+		fn(nvmeq->dev, ctx, &cqe);
+	}
+}
+
+static void nvme_free_queue_mem(struct nvme_queue *nvmeq)
+{
+	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+	kfree(nvmeq);
+}
+
 static void nvme_free_queue(struct nvme_dev *dev, int qid)
 {
 	struct nvme_queue *nvmeq = dev->queues[qid];
 	int vector = dev->entry[nvmeq->cq_vector].vector;
 
+	spin_lock_irq(&nvmeq->q_lock);
+	nvme_cancel_ios(nvmeq, false);
+	spin_unlock_irq(&nvmeq->q_lock);
+
 	irq_set_affinity_hint(vector, NULL);
 	free_irq(vector, nvmeq);
 
@@ -876,18 +917,15 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid)
 		adapter_delete_cq(dev, qid);
 	}
 
-	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
-				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
-	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
-					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
-	kfree(nvmeq);
+	nvme_free_queue_mem(nvmeq);
 }
 
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
 							int depth, int vector)
 {
 	struct device *dmadev = &dev->pci_dev->dev;
-	unsigned extra = (depth / 8) + (depth * sizeof(struct nvme_cmd_info));
+	unsigned extra = DIV_ROUND_UP(depth, 8) + (depth *
+						sizeof(struct nvme_cmd_info));
 	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
 	if (!nvmeq)
 		return NULL;
@@ -975,7 +1013,7 @@ static __devinit struct nvme_queue *nvme_create_queue(struct nvme_dev *dev,
 
 static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
 {
-	int result;
+	int result = 0;
 	u32 aqa;
 	u64 cap;
 	unsigned long timeout;
@@ -1005,17 +1043,22 @@ static int __devinit nvme_configure_admin_queue(struct nvme_dev *dev)
 	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
 	dev->db_stride = NVME_CAP_STRIDE(cap);
 
-	while (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
+	while (!result && !(readl(&dev->bar->csts) & NVME_CSTS_RDY)) {
 		msleep(100);
 		if (fatal_signal_pending(current))
-			return -EINTR;
+			result = -EINTR;
 		if (time_after(jiffies, timeout)) {
 			dev_err(&dev->pci_dev->dev,
 				"Device not ready; aborting initialisation\n");
-			return -ENODEV;
+			result = -ENODEV;
 		}
 	}
 
+	if (result) {
+		nvme_free_queue_mem(nvmeq);
+		return result;
+	}
+
 	result = queue_request_irq(dev, nvmeq, "nvme admin");
 	dev->queues[0] = nvmeq;
 	return result;
@@ -1037,6 +1080,8 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
 	offset = offset_in_page(addr);
 	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
 	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return ERR_PTR(-ENOMEM);
 
 	err = get_user_pages_fast(addr, count, 1, pages);
 	if (err < count) {
@@ -1146,14 +1191,13 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	return status;
 }
 
-static int nvme_user_admin_cmd(struct nvme_ns *ns,
+static int nvme_user_admin_cmd(struct nvme_dev *dev,
 					struct nvme_admin_cmd __user *ucmd)
 {
-	struct nvme_dev *dev = ns->dev;
 	struct nvme_admin_cmd cmd;
 	struct nvme_command c;
 	int status, length;
-	struct nvme_iod *iod;
+	struct nvme_iod *uninitialized_var(iod);
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -1204,7 +1248,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
 	case NVME_IOCTL_ID:
 		return ns->ns_id;
 	case NVME_IOCTL_ADMIN_CMD:
-		return nvme_user_admin_cmd(ns, (void __user *)arg);
+		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
 	case NVME_IOCTL_SUBMIT_IO:
 		return nvme_submit_io(ns, (void __user *)arg);
 	default:
@@ -1218,26 +1262,6 @@ static const struct block_device_operations nvme_fops = {
 	.compat_ioctl	= nvme_ioctl,
 };
 
-static void nvme_timeout_ios(struct nvme_queue *nvmeq)
-{
-	int depth = nvmeq->q_depth - 1;
-	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
-	unsigned long now = jiffies;
-	int cmdid;
-
-	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
-		void *ctx;
-		nvme_completion_fn fn;
-		static struct nvme_completion cqe = { .status = cpu_to_le16(NVME_SC_ABORT_REQ) << 1, };
-
-		if (!time_after(now, info[cmdid].timeout))
-			continue;
-		dev_warn(nvmeq->q_dmadev, "Timing out I/O %d\n", cmdid);
-		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
-		fn(nvmeq->dev, ctx, &cqe);
-	}
-}
-
 static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
 {
 	while (bio_list_peek(&nvmeq->sq_cong)) {
@@ -1269,7 +1293,7 @@ static int nvme_kthread(void *data)
 				spin_lock_irq(&nvmeq->q_lock);
 				if (nvme_process_cq(nvmeq))
 					printk("process_cq did something\n");
-				nvme_timeout_ios(nvmeq);
+				nvme_cancel_ios(nvmeq, true);
 				nvme_resubmit_bios(nvmeq);
 				spin_unlock_irq(&nvmeq->q_lock);
 			}
@@ -1339,6 +1363,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
 	ns->disk = disk;
 	lbaf = id->flbas & 0xf;
 	ns->lba_shift = id->lbaf[lbaf].ds;
+	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
+	if (dev->max_hw_sectors)
+		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
 
 	disk->major = nvme_major;
 	disk->minors = NVME_MINORS;
@@ -1383,7 +1410,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
 
 static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
 {
-	int result, cpu, i, nr_io_queues, db_bar_size;
+	int result, cpu, i, nr_io_queues, db_bar_size, q_depth;
 
 	nr_io_queues = num_online_cpus();
 	result = set_queue_count(dev, nr_io_queues);
@@ -1429,9 +1456,10 @@ static int __devinit nvme_setup_io_queues(struct nvme_dev *dev)
 		cpu = cpumask_next(cpu, cpu_online_mask);
 	}
 
+	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
+								NVME_Q_DEPTH);
 	for (i = 0; i < nr_io_queues; i++) {
-		dev->queues[i + 1] = nvme_create_queue(dev, i + 1,
-							NVME_Q_DEPTH, i);
+		dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i);
 		if (IS_ERR(dev->queues[i + 1]))
 			return PTR_ERR(dev->queues[i + 1]);
 		dev->queue_count++;
@@ -1480,6 +1508,10 @@ static int __devinit nvme_dev_add(struct nvme_dev *dev)
 	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
 	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
 	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
+	if (ctrl->mdts) {
+		int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
+		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
+	}
 
 	id_ns = mem;
 	for (i = 1; i <= nn; i++) {
@@ -1523,8 +1555,6 @@ static int nvme_dev_remove(struct nvme_dev *dev)
 	list_del(&dev->node);
 	spin_unlock(&dev_list_lock);
 
-	/* TODO: wait all I/O finished or cancel them */
-
 	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
 		list_del(&ns->list);
 		del_gendisk(ns->disk);
@@ -1560,15 +1590,33 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
 	dma_pool_destroy(dev->prp_small_pool);
 }
 
-/* XXX: Use an ida or something to let remove / add work correctly */
-static void nvme_set_instance(struct nvme_dev *dev)
+static DEFINE_IDA(nvme_instance_ida);
+
+static int nvme_set_instance(struct nvme_dev *dev)
 {
-	static int instance;
-	dev->instance = instance++;
+	int instance, error;
+
+	do {
+		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
+			return -ENODEV;
+
+		spin_lock(&dev_list_lock);
+		error = ida_get_new(&nvme_instance_ida, &instance);
+		spin_unlock(&dev_list_lock);
+	} while (error == -EAGAIN);
+
+	if (error)
+		return -ENODEV;
+
+	dev->instance = instance;
+	return 0;
 }
 
 static void nvme_release_instance(struct nvme_dev *dev)
 {
+	spin_lock(&dev_list_lock);
+	ida_remove(&nvme_instance_ida, dev->instance);
+	spin_unlock(&dev_list_lock);
 }
 
 static int __devinit nvme_probe(struct pci_dev *pdev,
@@ -1601,7 +1649,10 @@ static int __devinit nvme_probe(struct pci_dev *pdev,
 	pci_set_drvdata(pdev, dev);
 	dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
 	dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));
-	nvme_set_instance(dev);
+	result = nvme_set_instance(dev);
+	if (result)
+		goto disable;
+
 	dev->entry[0].vector = pdev->irq;
 
 	result = nvme_setup_prp_pools(dev);
@@ -1704,15 +1755,17 @@ static struct pci_driver nvme_driver = {
 
 static int __init nvme_init(void)
 {
-	int result = -EBUSY;
+	int result;
 
 	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
 	if (IS_ERR(nvme_thread))
 		return PTR_ERR(nvme_thread);
 
-	nvme_major = register_blkdev(nvme_major, "nvme");
-	if (nvme_major <= 0)
+	result = register_blkdev(nvme_major, "nvme");
+	if (result < 0)
 		goto kill_kthread;
+	else if (result > 0)
+		nvme_major = result;
 
 	result = pci_register_driver(&nvme_driver);
 	if (result)
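
The nvme_init() hunk relies on register_blkdev()'s three-way return convention, which the old "<= 0 is failure" test got wrong for the requested-major case. A minimal sketch of the contract (kernel context assumed; "demo" is a hypothetical driver name):

#include <linux/fs.h>

/* register_blkdev() returns:
 *   < 0   a negative errno on failure,
 *   > 0   the allocated major when 0 was requested,
 *   == 0  success when a specific major was requested. */
static int demo_register(int major)
{
	int result = register_blkdev(major, "demo");

	if (result < 0)
		return result;		/* registration failed */
	if (result > 0)
		major = result;		/* dynamic major was assigned */

	unregister_blkdev(major, "demo");
	return 0;
}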

+ 4 - 2
drivers/iommu/amd_iommu.c

@@ -266,7 +266,7 @@ static void swap_pci_ref(struct pci_dev **from, struct pci_dev *to)
 
 static int iommu_init_device(struct device *dev)
 {
-	struct pci_dev *dma_pdev, *pdev = to_pci_dev(dev);
+	struct pci_dev *dma_pdev = NULL, *pdev = to_pci_dev(dev);
 	struct iommu_dev_data *dev_data;
 	struct iommu_group *group;
 	u16 alias;
@@ -293,7 +293,9 @@ static int iommu_init_device(struct device *dev)
 		dev_data->alias_data = alias_data;
 
 		dma_pdev = pci_get_bus_and_slot(alias >> 8, alias & 0xff);
-	} else
+	}
+
+	if (dma_pdev == NULL)
 		dma_pdev = pci_dev_get(pdev);
 
 	/* Account for quirked devices */

+ 7 - 4
drivers/md/dm-mpath.c

@@ -1555,6 +1555,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 			   unsigned long arg)
 {
 	struct multipath *m = ti->private;
+	struct pgpath *pgpath;
 	struct block_device *bdev;
 	fmode_t mode;
 	unsigned long flags;
@@ -1570,12 +1571,14 @@ again:
 	if (!m->current_pgpath)
 		__choose_pgpath(m, 0);
 
-	if (m->current_pgpath) {
-		bdev = m->current_pgpath->path.dev->bdev;
-		mode = m->current_pgpath->path.dev->mode;
+	pgpath = m->current_pgpath;
+
+	if (pgpath) {
+		bdev = pgpath->path.dev->bdev;
+		mode = pgpath->path.dev->mode;
 	}
 
-	if (m->queue_io)
+	if ((pgpath && m->queue_io) || (!pgpath && m->queue_if_no_path))
 		r = -EAGAIN;
 	else if (!bdev)
 		r = -EIO;

+ 57 - 4
drivers/md/dm-table.c

@@ -1212,6 +1212,41 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 	return &t->targets[(KEYS_PER_NODE * n) + k];
 }
 
+static int count_device(struct dm_target *ti, struct dm_dev *dev,
+			sector_t start, sector_t len, void *data)
+{
+	unsigned *num_devices = data;
+
+	(*num_devices)++;
+
+	return 0;
+}
+
+/*
+ * Check whether a table has no data devices attached using each
+ * target's iterate_devices method.
+ * Returns false if the result is unknown because a target doesn't
+ * support iterate_devices.
+ */
+bool dm_table_has_no_data_devices(struct dm_table *table)
+{
+	struct dm_target *uninitialized_var(ti);
+	unsigned i = 0, num_devices = 0;
+
+	while (i < dm_table_get_num_targets(table)) {
+		ti = dm_table_get_target(table, i++);
+
+		if (!ti->type->iterate_devices)
+			return false;
+
+		ti->type->iterate_devices(ti, count_device, &num_devices);
+		if (num_devices)
+			return false;
+	}
+
+	return true;
+}
+
 /*
  * Establish the new table's queue_limits and validate them.
  */
@@ -1354,17 +1389,25 @@ static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
 	return q && blk_queue_nonrot(q);
 }
 
-static bool dm_table_is_nonrot(struct dm_table *t)
+static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
+			     sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && !blk_queue_add_random(q);
+}
+
+static bool dm_table_all_devices_attribute(struct dm_table *t,
+					   iterate_devices_callout_fn func)
 {
 	struct dm_target *ti;
 	unsigned i = 0;
 
-	/* Ensure that all underlying device are non-rotational. */
 	while (i < dm_table_get_num_targets(t)) {
 		ti = dm_table_get_target(t, i++);
 
 		if (!ti->type->iterate_devices ||
-		    !ti->type->iterate_devices(ti, device_is_nonrot, NULL))
+		    !ti->type->iterate_devices(ti, func, NULL))
 			return 0;
 	}
 
@@ -1396,13 +1439,23 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	if (!dm_table_discard_zeroes_data(t))
 		q->limits.discard_zeroes_data = 0;
 
-	if (dm_table_is_nonrot(t))
+	/* Ensure that all underlying devices are non-rotational. */
+	if (dm_table_all_devices_attribute(t, device_is_nonrot))
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
 	else
 		queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q);
 
 	dm_table_set_integrity(t);
 
+	/*
+	 * to the entropy pool. Only request-based targets use this.
+	 * to the entropy pool, Only request-based targets use this.
+	 * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
+	 * have it set.
+	 */
+	if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random))
+		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
+
 	/*
 	 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
 	 * visible to other CPUs because, once the flag is set, incoming bios

+ 88 - 47
drivers/md/dm-thin.c

@@ -509,9 +509,9 @@ enum pool_mode {
 struct pool_features {
 	enum pool_mode mode;
 
-	unsigned zero_new_blocks:1;
-	unsigned discard_enabled:1;
-	unsigned discard_passdown:1;
+	bool zero_new_blocks:1;
+	bool discard_enabled:1;
+	bool discard_passdown:1;
 };
 
 struct thin_c;
@@ -580,7 +580,8 @@ struct pool_c {
 	struct dm_target_callbacks callbacks;
 
 	dm_block_t low_water_blocks;
-	struct pool_features pf;
+	struct pool_features requested_pf; /* Features requested during table load */
+	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
 };
 
 /*
@@ -1839,6 +1840,47 @@ static void __requeue_bios(struct pool *pool)
 /*----------------------------------------------------------------
  * Binding of control targets to a pool object
  *--------------------------------------------------------------*/
+static bool data_dev_supports_discard(struct pool_c *pt)
+{
+	struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
+
+	return q && blk_queue_discard(q);
+}
+
+/*
+ * If discard_passdown was enabled verify that the data device
+ * supports discards.  Disable discard_passdown if not.
+ */
+static void disable_passdown_if_not_supported(struct pool_c *pt)
+{
+	struct pool *pool = pt->pool;
+	struct block_device *data_bdev = pt->data_dev->bdev;
+	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
+	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
+	const char *reason = NULL;
+	char buf[BDEVNAME_SIZE];
+
+	if (!pt->adjusted_pf.discard_passdown)
+		return;
+
+	if (!data_dev_supports_discard(pt))
+		reason = "discard unsupported";
+
+	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
+		reason = "max discard sectors smaller than a block";
+
+	else if (data_limits->discard_granularity > block_size)
+		reason = "discard granularity larger than a block";
+
+	else if (block_size & (data_limits->discard_granularity - 1))
+		reason = "discard granularity not a factor of block size";
+
+	if (reason) {
+		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
+		pt->adjusted_pf.discard_passdown = false;
+	}
+}
+
 static int bind_control_target(struct pool *pool, struct dm_target *ti)
 {
 	struct pool_c *pt = ti->private;
@@ -1847,31 +1889,16 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 	 * We want to make sure that degraded pools are never upgraded.
 	 */
 	enum pool_mode old_mode = pool->pf.mode;
-	enum pool_mode new_mode = pt->pf.mode;
+	enum pool_mode new_mode = pt->adjusted_pf.mode;
 
 	if (old_mode > new_mode)
 		new_mode = old_mode;
 
 	pool->ti = ti;
 	pool->low_water_blocks = pt->low_water_blocks;
-	pool->pf = pt->pf;
-	set_pool_mode(pool, new_mode);
+	pool->pf = pt->adjusted_pf;
 
-	/*
-	 * If discard_passdown was enabled verify that the data device
-	 * supports discards.  Disable discard_passdown if not; otherwise
-	 * -EOPNOTSUPP will be returned.
-	 */
-	/* FIXME: pull this out into a sep fn. */
-	if (pt->pf.discard_passdown) {
-		struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
-		if (!q || !blk_queue_discard(q)) {
-			char buf[BDEVNAME_SIZE];
-			DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.",
-			       bdevname(pt->data_dev->bdev, buf));
-			pool->pf.discard_passdown = 0;
-		}
-	}
+	set_pool_mode(pool, new_mode);
 
 	return 0;
 }
@@ -1889,9 +1916,9 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
 static void pool_features_init(struct pool_features *pf)
 {
 	pf->mode = PM_WRITE;
-	pf->zero_new_blocks = 1;
-	pf->discard_enabled = 1;
-	pf->discard_passdown = 1;
+	pf->zero_new_blocks = true;
+	pf->discard_enabled = true;
+	pf->discard_passdown = true;
 }
 
 static void __pool_destroy(struct pool *pool)
@@ -2119,13 +2146,13 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
 		argc--;
 
 		if (!strcasecmp(arg_name, "skip_block_zeroing"))
-			pf->zero_new_blocks = 0;
+			pf->zero_new_blocks = false;
 
 		else if (!strcasecmp(arg_name, "ignore_discard"))
-			pf->discard_enabled = 0;
+			pf->discard_enabled = false;
 
 		else if (!strcasecmp(arg_name, "no_discard_passdown"))
-			pf->discard_passdown = 0;
+			pf->discard_passdown = false;
 
 		else if (!strcasecmp(arg_name, "read_only"))
 			pf->mode = PM_READ_ONLY;
@@ -2259,8 +2286,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	pt->metadata_dev = metadata_dev;
 	pt->data_dev = data_dev;
 	pt->low_water_blocks = low_water_blocks;
-	pt->pf = pf;
+	pt->adjusted_pf = pt->requested_pf = pf;
 	ti->num_flush_requests = 1;
+
 	/*
 	 * Only need to enable discards if the pool should pass
 	 * them down to the data device.  The thin device's discard
@@ -2268,12 +2296,14 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 */
 	if (pf.discard_enabled && pf.discard_passdown) {
 		ti->num_discard_requests = 1;
+
 		/*
 		 * Setting 'discards_supported' circumvents the normal
 		 * stacking of discard limits (this keeps the pool and
 		 * thin devices' discard limits consistent).
 		 */
 		ti->discards_supported = true;
+		ti->discard_zeroes_data_unsupported = true;
 	}
 	ti->private = pt;
 
@@ -2703,7 +2733,7 @@ static int pool_status(struct dm_target *ti, status_type_t type,
 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
 		       (unsigned long)pool->sectors_per_block,
 		       (unsigned long long)pt->low_water_blocks);
-		emit_flags(&pt->pf, result, sz, maxlen);
+		emit_flags(&pt->requested_pf, result, sz, maxlen);
 		break;
 	}
 
@@ -2732,20 +2762,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
-static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
 {
-	/*
-	 * FIXME: these limits may be incompatible with the pool's data device
-	 */
+	struct pool *pool = pt->pool;
+	struct queue_limits *data_limits;
+
 	limits->max_discard_sectors = pool->sectors_per_block;
 
 	/*
-	 * This is just a hint, and not enforced.  We have to cope with
-	 * bios that cover a block partially.  A discard that spans a block
-	 * boundary is not sent to this target.
+	 * discard_granularity is just a hint, and not enforced.
 	 */
-	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-	limits->discard_zeroes_data = pool->pf.zero_new_blocks;
+	if (pt->adjusted_pf.discard_passdown) {
+		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
+		limits->discard_granularity = data_limits->discard_granularity;
+	} else
+		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
 }
 
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -2755,15 +2786,25 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 	blk_limits_io_min(limits, 0);
 	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
-	if (pool->pf.discard_enabled)
-		set_discard_limits(pool, limits);
+
+	/*
+	 * pt->adjusted_pf is a staging area for the actual features to use.
+	 * They get transferred to the live pool in bind_control_target()
+	 * called from pool_preresume().
+	 */
+	if (!pt->adjusted_pf.discard_enabled)
+		return;
+
+	disable_passdown_if_not_supported(pt);
+
+	set_discard_limits(pt, limits);
 }
 
 static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 3, 0},
+	.version = {1, 4, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -3042,19 +3083,19 @@ static int thin_iterate_devices(struct dm_target *ti,
 	return 0;
 }
 
+/*
+ * A thin device always inherits its queue limits from its pool.
+ */
 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct thin_c *tc = ti->private;
-	struct pool *pool = tc->pool;
 
-	blk_limits_io_min(limits, 0);
-	blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
-	set_discard_limits(pool, limits);
+	*limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
 }
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 3, 0},
+	.version = {1, 4, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,

+ 4 - 4
drivers/md/dm-verity.c

@@ -718,8 +718,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	v->hash_dev_block_bits = ffs(num) - 1;
 
 	if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
-	    num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) !=
-	    (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) {
+	    (sector_t)(num_ll << (v->data_dev_block_bits - SECTOR_SHIFT))
+	    >> (v->data_dev_block_bits - SECTOR_SHIFT) != num_ll) {
 		ti->error = "Invalid data blocks";
 		r = -EINVAL;
 		goto bad;
@@ -733,8 +733,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
-	    num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) !=
-	    (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) {
+	    (sector_t)(num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT))
+	    >> (v->hash_dev_block_bits - SECTOR_SHIFT) != num_ll) {
 		ti->error = "Invalid hash start";
 		r = -EINVAL;
 		goto bad;
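
Both rewritten checks use the standard round-trip test for a lossy cast-plus-shift: shift left, truncate to sector_t, shift back, and compare with the input. A standalone model (sector_t deliberately modeled as 32-bit, where the truncation being guarded against can actually occur):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t sector_t;	/* assume a 32-bit sector_t for the demo */

static int lossy(unsigned long long num_ll, int shift)
{
	/* shift, truncate, shift back; any mismatch means bits were lost */
	return (sector_t)(num_ll << shift) >> shift != num_ll;
}

int main(void)
{
	printf("%d\n", lossy(0x1000ULL, 3));		/* 0: round-trips */
	printf("%d\n", lossy(0x40000000ULL, 3));	/* 1: truncated */
	return 0;
}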

+ 52 - 19
drivers/md/dm.c

@@ -865,10 +865,14 @@ static void dm_done(struct request *clone, int error, bool mapped)
 {
 	int r = error;
 	struct dm_rq_target_io *tio = clone->end_io_data;
-	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
+	dm_request_endio_fn rq_end_io = NULL;
 
-	if (mapped && rq_end_io)
-		r = rq_end_io(tio->ti, clone, error, &tio->info);
+	if (tio->ti) {
+		rq_end_io = tio->ti->type->rq_end_io;
+
+		if (mapped && rq_end_io)
+			r = rq_end_io(tio->ti, clone, error, &tio->info);
+	}
 
 	if (r <= 0)
 		/* The target wants to complete the I/O */
@@ -1588,15 +1592,6 @@ static int map_request(struct dm_target *ti, struct request *clone,
 	int r, requeued = 0;
 	struct dm_rq_target_io *tio = clone->end_io_data;
 
-	/*
-	 * Hold the md reference here for the in-flight I/O.
-	 * We can't rely on the reference count by device opener,
-	 * because the device may be closed during the request completion
-	 * when all bios are completed.
-	 * See the comment in rq_completed() too.
-	 */
-	dm_get(md);
-
 	tio->ti = ti;
 	r = ti->type->map_rq(ti, clone, &tio->info);
 	switch (r) {
@@ -1628,6 +1623,26 @@ static int map_request(struct dm_target *ti, struct request *clone,
 	return requeued;
 }
 
+static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
+{
+	struct request *clone;
+
+	blk_start_request(orig);
+	clone = orig->special;
+	atomic_inc(&md->pending[rq_data_dir(clone)]);
+
+	/*
+	 * Hold the md reference here for the in-flight I/O.
+	 * We can't rely on the reference count by device opener,
+	 * because the device may be closed during the request completion
+	 * when all bios are completed.
+	 * See the comment in rq_completed() too.
+	 */
+	dm_get(md);
+
+	return clone;
+}
+
 /*
  * q->request_fn for request-based dm.
  * Called with the queue lock held.
@@ -1657,14 +1672,21 @@ static void dm_request_fn(struct request_queue *q)
 			pos = blk_rq_pos(rq);
 
 		ti = dm_table_find_target(map, pos);
-		BUG_ON(!dm_target_is_valid(ti));
+		if (!dm_target_is_valid(ti)) {
+			/*
+			 * Must perform the setup that dm_done() requires
+			 * before calling dm_kill_unmapped_request().
+			 */
+			DMERR_LIMIT("request attempted access beyond the end of device");
+			clone = dm_start_request(md, rq);
+			dm_kill_unmapped_request(clone, -EIO);
+			continue;
+		}
 
 		if (ti->type->busy && ti->type->busy(ti))
 			goto delay_and_out;
 
-		blk_start_request(rq);
-		clone = rq->special;
-		atomic_inc(&md->pending[rq_data_dir(clone)]);
+		clone = dm_start_request(md, rq);
 
 		spin_unlock(q->queue_lock);
 		if (map_request(ti, clone, md))
@@ -1684,8 +1706,6 @@ delay_and_out:
 	blk_delay_queue(q, HZ / 10);
 out:
 	dm_table_put(map);
-
-	return;
 }
 
 int dm_underlying_device_busy(struct request_queue *q)
@@ -2409,7 +2429,7 @@ static void dm_queue_flush(struct mapped_device *md)
  */
 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
-	struct dm_table *map = ERR_PTR(-EINVAL);
+	struct dm_table *live_map, *map = ERR_PTR(-EINVAL);
 	struct queue_limits limits;
 	int r;
 
@@ -2419,6 +2439,19 @@ struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	if (!dm_suspended_md(md))
 		goto out;
 
+	/*
+	 * If the new table has no data devices, retain the existing limits.
+	 * This helps multipath with queue_if_no_path if all paths disappear,
+	 * then new I/O is queued based on these limits, and then some paths
+	 * reappear.
+	 */
+	if (dm_table_has_no_data_devices(table)) {
+		live_map = dm_get_live_table(md);
+		if (live_map)
+			limits = md->queue->limits;
+		dm_table_put(live_map);
+	}
+
 	r = dm_calculate_queue_limits(table, &limits);
 	if (r) {
 		map = ERR_PTR(r);

+ 1 - 0
drivers/md/dm.h

@@ -54,6 +54,7 @@ void dm_table_event_callback(struct dm_table *t,
 			     void (*fn)(void *), void *context);
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
+bool dm_table_has_no_data_devices(struct dm_table *table);
 int dm_calculate_queue_limits(struct dm_table *table,
 			      struct queue_limits *limits);
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,

+ 42 - 6
drivers/mtd/mtdchar.c

@@ -1123,6 +1123,33 @@ static unsigned long mtdchar_get_unmapped_area(struct file *file,
 }
 #endif
 
+static inline unsigned long get_vm_size(struct vm_area_struct *vma)
+{
+	return vma->vm_end - vma->vm_start;
+}
+
+static inline resource_size_t get_vm_offset(struct vm_area_struct *vma)
+{
+	return (resource_size_t) vma->vm_pgoff << PAGE_SHIFT;
+}
+
+/*
+ * Set a new vm offset.
+ *
+ * Verify that the incoming offset really works as a page offset,
+ * and that the offset and size fit in a resource_size_t.
+ */
+static inline int set_vm_offset(struct vm_area_struct *vma, resource_size_t off)
+{
+	pgoff_t pgoff = off >> PAGE_SHIFT;
+	if (off != (resource_size_t) pgoff << PAGE_SHIFT)
+		return -EINVAL;
+	if (off + get_vm_size(vma) - 1 < off)
+		return -EINVAL;
+	vma->vm_pgoff = pgoff;
+	return 0;
+}
+
 /*
  * set up a mapping for shared memory segments
  */
@@ -1132,20 +1159,29 @@ static int mtdchar_mmap(struct file *file, struct vm_area_struct *vma)
 	struct mtd_file_info *mfi = file->private_data;
 	struct mtd_info *mtd = mfi->mtd;
 	struct map_info *map = mtd->priv;
-	unsigned long start;
-	unsigned long off;
-	u32 len;
+	resource_size_t start, off;
+	unsigned long len, vma_len;
 
 	if (mtd->type == MTD_RAM || mtd->type == MTD_ROM) {
-		off = vma->vm_pgoff << PAGE_SHIFT;
+		off = get_vm_offset(vma);
 		start = map->phys;
 		len = PAGE_ALIGN((start & ~PAGE_MASK) + map->size);
 		start &= PAGE_MASK;
-		if ((vma->vm_end - vma->vm_start + off) > len)
+		vma_len = get_vm_size(vma);
+
+		/* Overflow in off+len? */
+		if (vma_len + off < off)
+			return -EINVAL;
+		/* Does it fit in the mapping? */
+		if (vma_len + off > len)
 			return -EINVAL;
 
 		off += start;
-		vma->vm_pgoff = off >> PAGE_SHIFT;
+		/* Did that overflow? */
+		if (off < start)
+			return -EINVAL;
+		if (set_vm_offset(vma, off) < 0)
+			return -EINVAL;
 		vma->vm_flags |= VM_IO | VM_RESERVED;
 
 #ifdef pgprot_noncached
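
The new checks are instances of the classic unsigned-wraparound idiom: for unsigned a and b, a + b overflows exactly when the sum compares less than a (set_vm_offset() subtracts 1 so a range ending at the very top of the address space still passes). A standalone demonstration:

#include <stdint.h>
#include <stdio.h>

static int add_wraps(uint64_t off, uint64_t len)
{
	return off + len < off;		/* true only on unsigned wrap */
}

int main(void)
{
	printf("%d\n", add_wraps(UINT64_MAX - 4, 8));	/* 1: wraps */
	printf("%d\n", add_wraps(4096, 8));		/* 0: fits */
	return 0;
}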

+ 1 - 1
drivers/net/ethernet/broadcom/bnx2.c

@@ -8564,7 +8564,7 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	return 0;
 
 error:
-	iounmap(bp->regview);
+	pci_iounmap(pdev, bp->regview);
 	pci_release_regions(pdev);
 	pci_disable_device(pdev);
 	pci_set_drvdata(pdev, NULL);

+ 1 - 3
drivers/net/ethernet/octeon/octeon_mgmt.c

@@ -722,10 +722,8 @@ static int octeon_mgmt_init_phy(struct net_device *netdev)
 				   octeon_mgmt_adjust_link, 0,
 				   PHY_INTERFACE_MODE_MII);
 
-	if (IS_ERR(p->phydev)) {
-		p->phydev = NULL;
+	if (!p->phydev)
 		return -1;
-	}
 
 	phy_start_aneg(p->phydev);
 

+ 2 - 2
drivers/net/ethernet/pasemi/pasemi_mac.c

@@ -1101,9 +1101,9 @@ static int pasemi_mac_phy_init(struct net_device *dev)
 	phydev = of_phy_connect(dev, phy_dn, &pasemi_adjust_link, 0,
 				PHY_INTERFACE_MODE_SGMII);
 
-	if (IS_ERR(phydev)) {
+	if (!phydev) {
 		printk(KERN_ERR "%s: Could not attach to phy\n", dev->name);
-		return PTR_ERR(phydev);
+		return -ENODEV;
 	}
 
 	mac->phydev = phydev;

+ 2 - 2
drivers/net/ethernet/qlogic/qlcnic/qlcnic_ctx.c

@@ -15,7 +15,7 @@ qlcnic_poll_rsp(struct qlcnic_adapter *adapter)
 
 	do {
 		/* give atleast 1ms for firmware to respond */
-		msleep(1);
+		mdelay(1);
 
 		if (++timeout > QLCNIC_OS_CRB_RETRY_COUNT)
 			return QLCNIC_CDRP_RSP_TIMEOUT;
@@ -601,7 +601,7 @@ void qlcnic_fw_destroy_ctx(struct qlcnic_adapter *adapter)
 		qlcnic_fw_cmd_destroy_tx_ctx(adapter);
 
 		/* Allow dma queues to drain after context reset */
-		msleep(20);
+		mdelay(20);
 	}
 }
 

+ 27 - 1
drivers/net/phy/smsc.c

@@ -56,6 +56,32 @@ static int smsc_phy_config_init(struct phy_device *phydev)
 	return smsc_phy_ack_interrupt (phydev);
 }
 
+static int lan87xx_config_init(struct phy_device *phydev)
+{
+	/*
+	 * Make sure the EDPWRDOWN bit is NOT set. Setting this bit on
+	 * LAN8710/LAN8720 PHY causes the PHY to misbehave, likely due
+	 * to a bug on the chip.
+	 *
+	 * When the system is powered on with the network cable being
+	 * disconnected all the way until after ifconfig ethX up is
+	 * issued for the LAN port with this PHY, connecting the cable
+	 * afterwards does not cause LINK change detection, while the
+	 * expected behavior is the Link UP being detected.
+	 */
+	int rc = phy_read(phydev, MII_LAN83C185_CTRL_STATUS);
+	if (rc < 0)
+		return rc;
+
+	rc &= ~MII_LAN83C185_EDPWRDOWN;
+
+	rc = phy_write(phydev, MII_LAN83C185_CTRL_STATUS, rc);
+	if (rc < 0)
+		return rc;
+
+	return smsc_phy_ack_interrupt(phydev);
+}
+
 static int lan911x_config_init(struct phy_device *phydev)
 {
 	return smsc_phy_ack_interrupt(phydev);
@@ -162,7 +188,7 @@ static struct phy_driver smsc_phy_driver[] = {
 	/* basic functions */
 	.config_aneg	= genphy_config_aneg,
 	.read_status	= genphy_read_status,
-	.config_init	= smsc_phy_config_init,
+	.config_init	= lan87xx_config_init,
 
 	/* IRQ related */
 	.ack_interrupt	= smsc_phy_ack_interrupt,

+ 6 - 6
drivers/net/team/team.c

@@ -1653,8 +1653,8 @@ static int team_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info)
 
 	hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq,
 			  &team_nl_family, 0, TEAM_CMD_NOOP);
-	if (IS_ERR(hdr)) {
-		err = PTR_ERR(hdr);
+	if (!hdr) {
+		err = -EMSGSIZE;
 		goto err_msg_put;
 	}
 
@@ -1848,8 +1848,8 @@ start_again:
 
 	hdr = genlmsg_put(skb, pid, seq, &team_nl_family, flags | NLM_F_MULTI,
 			  TEAM_CMD_OPTIONS_GET);
-	if (IS_ERR(hdr))
-		return PTR_ERR(hdr);
+	if (!hdr)
+		return -EMSGSIZE;
 
 	if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
 		goto nla_put_failure;
@@ -2068,8 +2068,8 @@ static int team_nl_fill_port_list_get(struct sk_buff *skb,
 
 	hdr = genlmsg_put(skb, pid, seq, &team_nl_family, flags,
 			  TEAM_CMD_PORT_LIST_GET);
-	if (IS_ERR(hdr))
-		return PTR_ERR(hdr);
+	if (!hdr)
+		return -EMSGSIZE;
 
 	if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex))
 		goto nla_put_failure;
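
Same fix shape here as in l2tp_netlink.c below: genlmsg_put() returns NULL when the skb lacks room for the header, never an ERR_PTR, so the old IS_ERR() branches were dead code that reported failure as success. Skeleton of a corrected fill function (team.c context assumed, attribute fills elided):

static int demo_fill(struct sk_buff *msg, u32 pid, u32 seq, int flags)
{
	void *hdr = genlmsg_put(msg, pid, seq, &team_nl_family, flags,
				TEAM_CMD_NOOP);
	if (!hdr)
		return -EMSGSIZE;	/* NULL, not an ERR_PTR */

	/* ... nla_put_*() attribute fills go here ... */

	return genlmsg_end(msg, hdr);
}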

+ 1 - 0
drivers/net/usb/smsc75xx.c

@@ -1253,6 +1253,7 @@ static struct usb_driver smsc75xx_driver = {
 	.probe		= usbnet_probe,
 	.suspend	= usbnet_suspend,
 	.resume		= usbnet_resume,
+	.reset_resume	= usbnet_resume,
 	.disconnect	= usbnet_disconnect,
 	.disable_hub_initiated_lpm = 1,
 };

+ 6 - 0
fs/dcache.c

@@ -1134,6 +1134,8 @@ positive:
 	return 1;
 
 rename_retry:
+	if (locked)
+		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;
@@ -1236,6 +1238,8 @@ out:
 rename_retry:
 	if (found)
 		return found;
+	if (locked)
+		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;
@@ -3035,6 +3039,8 @@ resume:
 	return;
 
 rename_retry:
+	if (locked)
+		goto again;
 	locked = 1;
 	write_seqlock(&rename_lock);
 	goto again;
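
All three hunks plug the same hole in the seqlock retry pattern: the walk runs once under read_seqbegin(), escalates to write_seqlock() if it raced with a rename, but a pass that already held the write lock would try to take it again. A loose outline of the pattern with the added guard (not the full dentry-walk logic):

	int locked = 0;
	unsigned seq;
again:
	if (!locked)
		seq = read_seqbegin(&rename_lock);
	/* ... walk the dentry tree; several points may jump to
	 * rename_retry if a concurrent rename is detected ... */
	if (!locked && read_seqretry(&rename_lock, seq))
		goto rename_retry;
	if (locked)
		write_sequnlock(&rename_lock);
	return;

rename_retry:
	if (locked)		/* the fix: the write lock is already */
		goto again;	/* held, so just restart the walk */
	locked = 1;
	write_seqlock(&rename_lock);
	goto again;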

+ 1 - 2
fs/lockd/svclock.c

@@ -289,7 +289,6 @@ static void nlmsvc_free_block(struct kref *kref)
 	dprintk("lockd: freeing block %p...\n", block);
 
 	/* Remove block from file's list of blocks */
-	mutex_lock(&file->f_mutex);
 	list_del_init(&block->b_flist);
 	mutex_unlock(&file->f_mutex);
 
@@ -303,7 +302,7 @@ static void nlmsvc_free_block(struct kref *kref)
 static void nlmsvc_release_block(struct nlm_block *block)
 {
 	if (block != NULL)
-		kref_put(&block->b_count, nlmsvc_free_block);
+		kref_put_mutex(&block->b_count, nlmsvc_free_block, &block->b_file->f_mutex);
 }
 
 /*
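
The svclock.c change closes a window where the refcount reached zero before f_mutex was taken, so a list walker holding the mutex could still find and re-reference the dying block. kref_put_mutex() takes the lock before the final decrement; roughly (a sketch of the helper's logic, not a verbatim copy of include/linux/kref.h):

#include <linux/kref.h>
#include <linux/mutex.h>

static inline int kref_put_mutex_sketch(struct kref *kref,
					void (*release)(struct kref *),
					struct mutex *lock)
{
	/* fast path: drop one reference unless it is the last one */
	if (atomic_add_unless(&kref->refcount, -1, 1))
		return 0;

	mutex_lock(lock);
	if (!atomic_dec_and_test(&kref->refcount)) {
		mutex_unlock(lock);	/* raced: a new reference appeared */
		return 0;
	}
	release(kref);			/* runs with the lock held */
	return 1;
}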

+ 8 - 2
fs/namespace.c

@@ -1886,8 +1886,14 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 		return err;
 
 	err = -EINVAL;
-	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(real_mount(path->mnt)))
-		goto unlock;
+	if (unlikely(!check_mnt(real_mount(path->mnt)))) {
+		/* that's acceptable only for automounts done in private ns */
+		if (!(mnt_flags & MNT_SHRINKABLE))
+			goto unlock;
+		/* ... and for those we'd better have mountpoint still alive */
+		if (!real_mount(path->mnt)->mnt_ns)
+			goto unlock;
+	}
 
 	/* Refuse the same filesystem on the same mount point */
 	err = -EBUSY;

+ 24 - 18
include/linux/iommu.h

@@ -256,72 +256,78 @@ static inline void iommu_set_fault_handler(struct iommu_domain *domain,
 {
 }
 
-int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group)
+static inline int iommu_attach_group(struct iommu_domain *domain,
+				     struct iommu_group *group)
 {
 	return -ENODEV;
 }
 
-void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group)
+static inline void iommu_detach_group(struct iommu_domain *domain,
+				      struct iommu_group *group)
 {
 }
 
-struct iommu_group *iommu_group_alloc(void)
+static inline struct iommu_group *iommu_group_alloc(void)
 {
 	return ERR_PTR(-ENODEV);
 }
 
-void *iommu_group_get_iommudata(struct iommu_group *group)
+static inline void *iommu_group_get_iommudata(struct iommu_group *group)
 {
 	return NULL;
 }
 
-void iommu_group_set_iommudata(struct iommu_group *group, void *iommu_data,
-			       void (*release)(void *iommu_data))
+static inline void iommu_group_set_iommudata(struct iommu_group *group,
+					     void *iommu_data,
+					     void (*release)(void *iommu_data))
 {
 }
 
-int iommu_group_set_name(struct iommu_group *group, const char *name)
+static inline int iommu_group_set_name(struct iommu_group *group,
+				       const char *name)
 {
 	return -ENODEV;
 }
 
-int iommu_group_add_device(struct iommu_group *group, struct device *dev)
+static inline int iommu_group_add_device(struct iommu_group *group,
+					 struct device *dev)
 {
 	return -ENODEV;
 }
 
-void iommu_group_remove_device(struct device *dev)
+static inline void iommu_group_remove_device(struct device *dev)
 {
 }
 
-int iommu_group_for_each_dev(struct iommu_group *group, void *data,
-			     int (*fn)(struct device *, void *))
+static inline int iommu_group_for_each_dev(struct iommu_group *group,
+					   void *data,
+					   int (*fn)(struct device *, void *))
 {
 	return -ENODEV;
 }
 
-struct iommu_group *iommu_group_get(struct device *dev)
+static inline struct iommu_group *iommu_group_get(struct device *dev)
 {
 	return NULL;
 }
 
-void iommu_group_put(struct iommu_group *group)
+static inline void iommu_group_put(struct iommu_group *group)
 {
 }
 
-int iommu_group_register_notifier(struct iommu_group *group,
-				  struct notifier_block *nb)
+static inline int iommu_group_register_notifier(struct iommu_group *group,
+						struct notifier_block *nb)
 {
 	return -ENODEV;
 }
 
-int iommu_group_unregister_notifier(struct iommu_group *group,
-				    struct notifier_block *nb)
+static inline int iommu_group_unregister_notifier(struct iommu_group *group,
+						  struct notifier_block *nb)
 {
 	return 0;
 }
 
-int iommu_group_id(struct iommu_group *group)
+static inline int iommu_group_id(struct iommu_group *group)
 {
 	return -ENODEV;
 }
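
The whole hunk is one fix applied twelve times: stubs for the disabled-IOMMU case must be static inline, otherwise every translation unit including the header emits its own definition and the link fails with multiple-definition errors. The general shape of the idiom (config symbol and function are hypothetical stand-ins):

#include <linux/errno.h>

struct frob_dev;			/* hypothetical */

#ifdef CONFIG_FROBNICATOR		/* hypothetical config symbol */
int frob_attach(struct frob_dev *dev);	/* real definition in a .c file */
#else
static inline int frob_attach(struct frob_dev *dev)
{
	return -ENODEV;			/* subsystem compiled out */
}
#endif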

+ 2 - 0
include/linux/nvme.h

@@ -35,8 +35,10 @@ struct nvme_bar {
 	__u64			acq;	/* Admin CQ Base Address */
 };
 
+#define NVME_CAP_MQES(cap)	((cap) & 0xffff)
 #define NVME_CAP_TIMEOUT(cap)	(((cap) >> 24) & 0xff)
 #define NVME_CAP_STRIDE(cap)	(((cap) >> 32) & 0xf)
+#define NVME_CAP_MPSMIN(cap)	(((cap) >> 48) & 0xf)
 
 enum {
 	NVME_CC_ENABLE		= 1 << 0,
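
These two macros feed the nvme.c hunks above: MQES bounds the per-queue depth and MPSMIN anchors the MDTS transfer-size computation. A worked example of the max_hw_sectors arithmetic, with illustrative values:

#include <stdio.h>

int main(void)
{
	int mpsmin = 0;			/* CAP.MPSMIN = 0 => 4 KiB pages */
	int mdts = 5;			/* max transfer = 2^5 min-pages */
	int shift = mpsmin + 12;	/* log2(bytes per min-page) */
	unsigned max_hw_sectors = 1u << (mdts + shift - 9); /* 512 B sectors */

	printf("%u sectors = %u KiB\n", max_hw_sectors, max_hw_sectors / 2);
	/* prints: 256 sectors = 128 KiB */
	return 0;
}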

+ 0 - 1
mm/huge_memory.c

@@ -1811,7 +1811,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
 			src_page = pte_page(pteval);
 			copy_user_highpage(page, src_page, address, vma);
 			VM_BUG_ON(page_mapcount(src_page) != 1);
-			VM_BUG_ON(page_count(src_page) != 2);
 			release_pte_page(src_page);
 			/*
 			 * ptl mostly unnecessary, but preempt has to

+ 4 - 1
net/ipv4/inetpeer.c

@@ -510,7 +510,10 @@ relookup:
 					secure_ipv6_id(daddr->addr.a6));
 		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
 		p->rate_tokens = 0;
-		p->rate_last = 0;
+		/* 60*HZ is arbitrary, but chosen high enough that the first
+		 * calculation of tokens yields its maximum.
+		 */
+		p->rate_last = jiffies - 60*HZ;
 		INIT_LIST_HEAD(&p->gc_list);
 
 		/* Link the node. */
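
Seeding rate_last one minute in the past makes the first token computation in inet_peer_xrlim_allow() see a full idle interval, so the bucket starts full rather than freshly drained. A standalone model of that token math (HZ and the burst cap are stand-in values; unsigned arithmetic keeps the deliberate jiffies wrap harmless):

#include <stdio.h>

#define HZ		1000
#define BURST_CAP	(6 * HZ)	/* stand-in for the real clamp */

int main(void)
{
	unsigned long jiffies = 5 * HZ;			/* shortly after boot */
	unsigned long rate_last = jiffies - 60 * HZ;	/* the fix: wraps, fine */
	unsigned long tokens = 0;

	tokens += jiffies - rate_last;	/* 60*HZ of credit on the first call */
	if (tokens > BURST_CAP)
		tokens = BURST_CAP;	/* clamp: the bucket starts full */

	printf("first-call tokens: %lu (cap %d)\n", tokens, BURST_CAP);
	return 0;
}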

+ 6 - 6
net/l2tp/l2tp_netlink.c

@@ -80,8 +80,8 @@ static int l2tp_nl_cmd_noop(struct sk_buff *skb, struct genl_info *info)
 
 	hdr = genlmsg_put(msg, info->snd_pid, info->snd_seq,
 			  &l2tp_nl_family, 0, L2TP_CMD_NOOP);
-	if (IS_ERR(hdr)) {
-		ret = PTR_ERR(hdr);
+	if (!hdr) {
+		ret = -EMSGSIZE;
 		goto err_out;
 	}
 
@@ -250,8 +250,8 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 pid, u32 seq, int flags,
 
 	hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags,
 			  L2TP_CMD_TUNNEL_GET);
-	if (IS_ERR(hdr))
-		return PTR_ERR(hdr);
+	if (!hdr)
+		return -EMSGSIZE;
 
 	if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) ||
 	    nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
@@ -617,8 +617,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 pid, u32 seq, int flags
 	sk = tunnel->sock;
 
 	hdr = genlmsg_put(skb, pid, seq, &l2tp_nl_family, flags, L2TP_CMD_SESSION_GET);
-	if (IS_ERR(hdr))
-		return PTR_ERR(hdr);
+	if (!hdr)
+		return -EMSGSIZE;
 
 	if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
 	    nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) ||

+ 4 - 4
net/netfilter/xt_limit.c

@@ -117,11 +117,11 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
 
 	/* For SMP, we only want to use one set of state. */
 	r->master = priv;
+	/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies * 128. */
+	priv->prev = jiffies;
+	priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
 	if (r->cost == 0) {
-		/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
-		   128. */
-		priv->prev = jiffies;
-		priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
 		r->credit_cap = priv->credit; /* Credits full. */
 		r->cost = user2credits(r->avg);
 	}