
Merge branch 'for-linus' of git://git.linaro.org/people/rmk/linux-arm

Pull ARM updates from Russell King:
 "Included in this series are:

   1. BE8 (modern big endian) changes for ARM from Ben Dooks
   2. big.LITTLE support from Nicolas Pitre and Dave Martin
   3. support for LPAE systems with all system memory above 4GB
   4. Perf updates from Will Deacon
   5. Additional prefetching and other performance improvements from Will.
   6. Neon-optimised AES implementation from Ard.
   7. A number of smaller fixes scattered around the place.

  There is a rather horrid merge conflict in tools/perf - I was never
  notified of the conflict because it originally occurred between Will's
  tree and other stuff.  Consequently I have a resolution which Will
  forwarded me, which I'll forward on immediately after sending this
  mail.

  The other notable thing is I'm expecting some build breakage in the
  crypto stuff on ARM only with Ard's AES patches.  These were merged
  into a stable git branch which others had already pulled, so there's
  little I can do about this.  The problem is caused because these
  patches have a dependency on some code in the crypto git tree - I
  tried requesting a branch I can pull to resolve these, and all I got
  each time from the crypto people was "we'll revert our patches then"
  which would only make things worse since I still don't have the
  dependent patches.  I've no idea what's going on there or how to
  resolve that, and since I can't split these patches from the rest of
  this pull request, I'm rather stuck with pushing this as-is or
  reverting Ard's patches.

  Since it should "come out in the wash" I've left them in - the only
  build problems they seem to cause at the moment are with randconfigs,
  and it's a new feature anyway.  However, if by -rc1 the
  dependencies aren't in, I think it'd be best to revert Ard's patches"

I resolved the perf conflict roughly as per the patch sent by Russell,
but there may be some differences.  Any errors are likely mine.  Let's
see how the crypto issues work out.

* 'for-linus' of git://git.linaro.org/people/rmk/linux-arm: (110 commits)
  ARM: 7868/1: arm/arm64: remove atomic_clear_mask() in "include/asm/atomic.h"
  ARM: 7867/1: include: asm: use 'int' instead of 'unsigned long' for 'oldval' in atomic_cmpxchg().
  ARM: 7866/1: include: asm: use 'long long' instead of 'u64' within atomic.h
  ARM: 7871/1: amba: Extend number of IRQS
  ARM: 7887/1: Don't smp_cross_call() on UP devices in arch_irq_work_raise()
  ARM: 7872/1: Support arch_irq_work_raise() via self IPIs
  ARM: 7880/1: Clear the IT state independent of the Thumb-2 mode
  ARM: 7878/1: nommu: Implement dummy early_paging_init()
  ARM: 7876/1: clear Thumb-2 IT state on exception handling
  ARM: 7874/2: bL_switcher: Remove cpu_hotplug_driver_{lock,unlock}()
  ARM: footbridge: fix build warnings for netwinder
  ARM: 7873/1: vfp: clear vfp_current_hw_state for dying cpu
  ARM: fix misplaced arch_virt_to_idmap()
  ARM: 7848/1: mcpm: Implement cpu_kill() to synchronise on powerdown
  ARM: 7847/1: mcpm: Factor out logical-to-physical CPU translation
  ARM: 7869/1: remove unused XSCALE_PMU Kconfig param
  ARM: 7864/1: Handle 64-bit memory in case of 32-bit phys_addr_t
  ARM: 7863/1: Let arm_add_memory() always use 64-bit arguments
  ARM: 7862/1: pcpu: replace __get_cpu_var_uses
  ARM: 7861/1: cacheflush: consolidate single-CPU ARMv7 cache disabling code
  ...
Linus Torvalds, 11 years ago
commit f47671e2d8

100 changed files with 7617 additions and 547 deletions
arch/arm/Kconfig | +30 -6
arch/arm/Kconfig.debug | +36 -5
arch/arm/Makefile | +1 -0
arch/arm/boot/compressed/head.S | +3 -6
arch/arm/common/Makefile | +2 -0
arch/arm/common/bL_switcher.c | +822 -0
arch/arm/common/bL_switcher_dummy_if.c | +71 -0
arch/arm/common/mcpm_entry.c | +27 -0
arch/arm/common/mcpm_head.S | +16 -2
arch/arm/common/mcpm_platsmp.c | +23 -4
arch/arm/common/timer-sp.c | +1 -1
arch/arm/configs/h3600_defconfig | +6 -16
arch/arm/crypto/.gitignore | +1 -0
arch/arm/crypto/Makefile | +12 -2
arch/arm/crypto/aes_glue.c | +6 -16
arch/arm/crypto/aes_glue.h | +19 -0
arch/arm/crypto/aesbs-core.S_shipped | +2544 -0
arch/arm/crypto/aesbs-glue.c | +434 -0
arch/arm/crypto/bsaes-armv7.pl | +2467 -0
arch/arm/include/asm/Kbuild | +1 -0
arch/arm/include/asm/assembler.h | +7 -0
arch/arm/include/asm/atomic.h | +46 -62
arch/arm/include/asm/bL_switcher.h | +77 -0
arch/arm/include/asm/bug.h | +6 -4
arch/arm/include/asm/cacheflush.h | +46 -0
arch/arm/include/asm/cmpxchg.h | +46 -12
arch/arm/include/asm/cputype.h | +1 -0
arch/arm/include/asm/hardirq.h | +1 -1
arch/arm/include/asm/hardware/coresight.h | +4 -4
arch/arm/include/asm/kgdb.h | +2 -1
arch/arm/include/asm/mach/arch.h | +1 -0
arch/arm/include/asm/mcpm.h | +39 -0
arch/arm/include/asm/memory.h | +67 -9
arch/arm/include/asm/mmu.h | +1 -1
arch/arm/include/asm/pgtable-2level.h | +7 -0
arch/arm/include/asm/pgtable-3level.h | +3 -0
arch/arm/include/asm/processor.h | +25 -8
arch/arm/include/asm/setup.h | +1 -1
arch/arm/include/asm/smp.h | +2 -0
arch/arm/include/asm/spinlock.h | +20 -16
arch/arm/include/asm/spinlock_types.h | +1 -1
arch/arm/include/asm/tlbflush.h | +17 -31
arch/arm/include/asm/unified.h | +4 -0
arch/arm/include/debug/efm32.S | +45 -0
arch/arm/include/debug/msm.S | +5 -0
arch/arm/include/debug/pl01x.S | +2 -0
arch/arm/include/uapi/asm/Kbuild | +1 -0
arch/arm/include/uapi/asm/perf_regs.h | +23 -0
arch/arm/kernel/Makefile | +3 -1
arch/arm/kernel/armksyms.c | +1 -0
arch/arm/kernel/entry-armv.S | +3 -3
arch/arm/kernel/entry-common.S | +1 -3
arch/arm/kernel/head.S | +66 -16
arch/arm/kernel/hw_breakpoint.c | +7 -7
arch/arm/kernel/kprobes.c | +4 -4
arch/arm/kernel/module.c | +34 -23
arch/arm/kernel/perf_event.c | +1 -2
arch/arm/kernel/perf_event_cpu.c | +1 -1
arch/arm/kernel/perf_regs.c | +30 -0
arch/arm/kernel/setup.c | +21 -7
arch/arm/kernel/signal.c | +11 -27
arch/arm/kernel/sigreturn_codes.S | +80 -0
arch/arm/kernel/sleep.S | +12 -15
arch/arm/kernel/smp.c | +41 -1
arch/arm/kernel/smp_scu.c | +7 -7
arch/arm/kernel/smp_tlb.c | +34 -2
arch/arm/kernel/smp_twd.c | +12 -12
arch/arm/kernel/suspend.c | +5 -3
arch/arm/kernel/traps.c | +15 -9
arch/arm/kvm/arm.c | +3 -3
arch/arm/lib/bitops.h | +5 -0
arch/arm/lib/uaccess_with_memcpy.c | +38 -3
arch/arm/mach-footbridge/netwinder-hw.c | +4 -4
arch/arm/mach-highbank/Kconfig | +2 -1
arch/arm/mach-ixp4xx/Kconfig | +0 -4
arch/arm/mach-mvebu/Kconfig | +1 -0
arch/arm/mach-mvebu/coherency_ll.S | +3 -0
arch/arm/mach-mvebu/headsmp.S | +4 -0
arch/arm/mach-sa1100/assabet.c | +3 -0
arch/arm/mach-sa1100/include/mach/gpio.h | +0 -55
arch/arm/mach-sa1100/include/mach/h3xxx.h | +2 -0
arch/arm/mach-sa1100/simpad.c | +1 -0
arch/arm/mach-tegra/Kconfig | +1 -1
arch/arm/mach-vexpress/Kconfig | +1 -0
arch/arm/mach-vexpress/dcscb.c | +4 -52
arch/arm/mach-vexpress/tc2_pm.c | +2 -46
arch/arm/mm/Kconfig | +6 -0
arch/arm/mm/abort-ev6.S | +2 -3
arch/arm/mm/alignment.c | +7 -2
arch/arm/mm/dma-mapping.c | +2 -2
arch/arm/mm/extable.c | +6 -1
arch/arm/mm/idmap.c | +4 -4
arch/arm/mm/mmap.c | +2 -4
arch/arm/mm/mmu.c | +82 -0
arch/arm/mm/nommu.c | +9 -0
arch/arm/mm/proc-v6.S | +1 -3
arch/arm/mm/proc-v7.S | +1 -3
arch/arm/net/bpf_jit_32.c | +5 -1
arch/arm/plat-versatile/headsmp.S | +2 -0
arch/arm/vfp/vfpmodule.c | +3 -3

+ 30 - 6
arch/arm/Kconfig

@@ -5,6 +5,7 @@ config ARM
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAVE_CUSTOM_GPIO_H
+	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_EXTABLE_SORT if MMU
 	select CLONE_BACKWARDS
@@ -51,6 +52,8 @@ config ARM
 	select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
 	select HAVE_PERF_EVENTS
+	select HAVE_PERF_REGS
+	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UID16
@@ -481,6 +484,7 @@ config ARCH_IXP4XX
 	bool "IXP4xx-based"
 	depends on MMU
 	select ARCH_HAS_DMA_SET_COHERENT_MASK
+	select ARCH_SUPPORTS_BIG_ENDIAN
 	select ARCH_REQUIRE_GPIOLIB
 	select CLKSRC_MMIO
 	select CPU_XSCALE
@@ -688,7 +692,6 @@ config ARCH_SA1100
 	select GENERIC_CLOCKEVENTS
 	select HAVE_IDE
 	select ISA
-	select NEED_MACH_GPIO_H
 	select NEED_MACH_MEMORY_H
 	select SPARSE_IRQ
 	help
@@ -1064,11 +1067,6 @@ config IWMMXT
 	  Enable support for iWMMXt context switching at run time if
 	  running on a CPU that supports it.
 
-config XSCALE_PMU
-	bool
-	depends on CPU_XSCALE
-	default y
-
 config MULTI_IRQ_HANDLER
 	bool
 	help
@@ -1516,6 +1514,32 @@ config MCPM
 	  for (multi-)cluster based systems, such as big.LITTLE based
 	  systems.
 
+config BIG_LITTLE
+	bool "big.LITTLE support (Experimental)"
+	depends on CPU_V7 && SMP
+	select MCPM
+	help
+	  This option enables support selections for the big.LITTLE
+	  system architecture.
+
+config BL_SWITCHER
+	bool "big.LITTLE switcher support"
+	depends on BIG_LITTLE && MCPM && HOTPLUG_CPU
+	select CPU_PM
+	select ARM_CPU_SUSPEND
+	help
+	  The big.LITTLE "switcher" provides the core functionality to
+	  transparently handle transition between a cluster of A15's
+	  and a cluster of A7's in a big.LITTLE system.
+
+config BL_SWITCHER_DUMMY_IF
+	tristate "Simple big.LITTLE switcher user interface"
+	depends on BL_SWITCHER && DEBUG_KERNEL
+	help
+	  This is a simple and dummy char dev interface to control
+	  the big.LITTLE switcher core code.  It is meant for
+	  debugging purposes only.
+
 choice
 	prompt "Memory split"
 	default VMSPLIT_3G

+ 36 - 5
arch/arm/Kconfig.debug

@@ -318,6 +318,7 @@ choice
 	config DEBUG_MSM_UART1
 		bool "Kernel low-level debugging messages via MSM UART1"
 		depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the first serial port on MSM devices.
@@ -325,6 +326,7 @@ choice
 	config DEBUG_MSM_UART2
 		bool "Kernel low-level debugging messages via MSM UART2"
 		depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the second serial port on MSM devices.
@@ -332,6 +334,7 @@ choice
 	config DEBUG_MSM_UART3
 		bool "Kernel low-level debugging messages via MSM UART3"
 		depends on ARCH_MSM7X00A || ARCH_MSM7X30 || ARCH_QSD8X50
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the third serial port on MSM devices.
@@ -340,6 +343,7 @@ choice
 		bool "Kernel low-level debugging messages via MSM 8660 UART"
 		depends on ARCH_MSM8X60
 		select MSM_HAS_DEBUG_UART_HS
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the serial port on MSM 8660 devices.
@@ -348,10 +352,20 @@ choice
 		bool "Kernel low-level debugging messages via MSM 8960 UART"
 		depends on ARCH_MSM8960
 		select MSM_HAS_DEBUG_UART_HS
+		select DEBUG_MSM_UART
 		help
 		  Say Y here if you want the debug print routines to direct
 		  their output to the serial port on MSM 8960 devices.
 
+	config DEBUG_MSM8974_UART
+		bool "Kernel low-level debugging messages via MSM 8974 UART"
+		depends on ARCH_MSM8974
+		select MSM_HAS_DEBUG_UART_HS
+		select DEBUG_MSM_UART
+		help
+		  Say Y here if you want the debug print routines to direct
+		  their output to the serial port on MSM 8974 devices.
+
 	config DEBUG_MVEBU_UART
 		bool "Kernel low-level debugging messages via MVEBU UART (old bootloaders)"
 		depends on ARCH_MVEBU
@@ -841,6 +855,20 @@ choice
 		  options; the platform specific options are deprecated
 		  and will be soon removed.
 
+	config DEBUG_LL_UART_EFM32
+		bool "Kernel low-level debugging via efm32 UART"
+		depends on ARCH_EFM32
+		help
+		  Say Y here if you want the debug print routines to direct
+		  their output to an UART or USART port on efm32 based
+		  machines. Use the following addresses for DEBUG_UART_PHYS:
+
+		    0x4000c000 | USART0
+		    0x4000c400 | USART1
+		    0x4000c800 | USART2
+		    0x4000e000 | UART0
+		    0x4000e400 | UART1
+
 	config DEBUG_LL_UART_PL01X
 		bool "Kernel low-level debugging via ARM Ltd PL01x Primecell UART"
 		help
@@ -887,11 +915,16 @@ config DEBUG_STI_UART
 	bool
 	depends on ARCH_STI
 
+config DEBUG_MSM_UART
+	bool
+	depends on ARCH_MSM
+
 config DEBUG_LL_INCLUDE
 	string
 	default "debug/8250.S" if DEBUG_LL_UART_8250 || DEBUG_UART_8250
 	default "debug/pl01x.S" if DEBUG_LL_UART_PL01X || DEBUG_UART_PL01X
 	default "debug/exynos.S" if DEBUG_EXYNOS_UART
+	default "debug/efm32.S" if DEBUG_LL_UART_EFM32
 	default "debug/icedcc.S" if DEBUG_ICEDCC
 	default "debug/imx.S" if DEBUG_IMX1_UART || \
 				 DEBUG_IMX25_UART || \
@@ -902,11 +935,7 @@ config DEBUG_LL_INCLUDE
 				 DEBUG_IMX53_UART ||\
 				 DEBUG_IMX6Q_UART || \
 				 DEBUG_IMX6SL_UART
-	default "debug/msm.S" if DEBUG_MSM_UART1 || \
-				 DEBUG_MSM_UART2 || \
-				 DEBUG_MSM_UART3 || \
-				 DEBUG_MSM8660_UART || \
-				 DEBUG_MSM8960_UART
+	default "debug/msm.S" if DEBUG_MSM_UART
 	default "debug/omap2plus.S" if DEBUG_OMAP2PLUS_UART
 	default "debug/sirf.S" if DEBUG_SIRFPRIMA2_UART1 || DEBUG_SIRFMARCO_UART1
 	default "debug/sti.S" if DEBUG_STI_UART
@@ -959,6 +988,7 @@ config DEBUG_UART_PHYS
 	default 0x20064000 if DEBUG_RK29_UART1 || DEBUG_RK3X_UART2
 	default 0x20068000 if DEBUG_RK29_UART2 || DEBUG_RK3X_UART3
 	default 0x20201000 if DEBUG_BCM2835
+	default 0x4000e400 if DEBUG_LL_UART_EFM32
 	default 0x40090000 if ARCH_LPC32XX
 	default 0x40100000 if DEBUG_PXA_UART1
 	default 0x42000000 if ARCH_GEMINI
@@ -989,6 +1019,7 @@ config DEBUG_UART_PHYS
 	default 0xfff36000 if DEBUG_HIGHBANK_UART
 	default 0xfffff700 if ARCH_IOP33X
 	depends on DEBUG_LL_UART_8250 || DEBUG_LL_UART_PL01X || \
+		DEBUG_LL_UART_EFM32 || \
 		DEBUG_UART_8250 || DEBUG_UART_PL01X
 
 config DEBUG_UART_VIRT

+ 1 - 0
arch/arm/Makefile

@@ -16,6 +16,7 @@ LDFLAGS		:=
 LDFLAGS_vmlinux	:=-p --no-undefined -X
 ifeq ($(CONFIG_CPU_ENDIAN_BE8),y)
 LDFLAGS_vmlinux	+= --be8
+LDFLAGS_MODULE	+= --be8
 endif
 
 OBJCOPYFLAGS	:=-O binary -R .comment -S

+ 3 - 6
arch/arm/boot/compressed/head.S

@@ -135,6 +135,7 @@ start:
 		.word	_edata			@ zImage end address
  THUMB(		.thumb			)
 1:
+ ARM_BE8(	setend	be )			@ go BE8 if compiled for BE8
 		mrs	r9, cpsr
 #ifdef CONFIG_ARM_VIRT_EXT
 		bl	__hyp_stub_install	@ get into SVC mode, reversibly
@@ -699,9 +700,7 @@ __armv4_mmu_cache_on:
 		mrc	p15, 0, r0, c1, c0, 0	@ read control reg
 		orr	r0, r0, #0x5000		@ I-cache enable, RR cache replacement
 		orr	r0, r0, #0x0030
-#ifdef CONFIG_CPU_ENDIAN_BE8
-		orr	r0, r0, #1 << 25	@ big-endian page tables
-#endif
+ ARM_BE8(	orr	r0, r0, #1 << 25 )	@ big-endian page tables
 		bl	__common_mmu_cache_on
 		mov	r0, #0
 		mcr	p15, 0, r0, c8, c7, 0	@ flush I,D TLBs
@@ -728,9 +727,7 @@ __armv7_mmu_cache_on:
 		orr	r0, r0, #1 << 22	@ U (v6 unaligned access model)
 						@ (needed for ARM1176)
 #ifdef CONFIG_MMU
-#ifdef CONFIG_CPU_ENDIAN_BE8
-		orr	r0, r0, #1 << 25	@ big-endian page tables
-#endif
+ ARM_BE8(	orr	r0, r0, #1 << 25 )	@ big-endian page tables
 		mrcne   p15, 0, r6, c2, c0, 2   @ read ttb control reg
 		orrne	r0, r0, #1		@ MMU enabled
 		movne	r1, #0xfffffffd		@ domain 0 = client

+ 2 - 0
arch/arm/common/Makefile

@@ -16,3 +16,5 @@ obj-$(CONFIG_MCPM)		+= mcpm_head.o mcpm_entry.o mcpm_platsmp.o vlock.o
 AFLAGS_mcpm_head.o		:= -march=armv7-a
 AFLAGS_vlock.o			:= -march=armv7-a
 obj-$(CONFIG_TI_PRIV_EDMA)	+= edma.o
+obj-$(CONFIG_BL_SWITCHER)	+= bL_switcher.o
+obj-$(CONFIG_BL_SWITCHER_DUMMY_IF) += bL_switcher_dummy_if.o

+ 822 - 0
arch/arm/common/bL_switcher.c

@@ -0,0 +1,822 @@
+/*
+ * arch/arm/common/bL_switcher.c -- big.LITTLE cluster switcher core driver
+ *
+ * Created by:	Nicolas Pitre, March 2012
+ * Copyright:	(C) 2012-2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/atomic.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/time.h>
+#include <linux/clockchips.h>
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+#include <linux/notifier.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/irqchip/arm-gic.h>
+#include <linux/moduleparam.h>
+
+#include <asm/smp_plat.h>
+#include <asm/cputype.h>
+#include <asm/suspend.h>
+#include <asm/mcpm.h>
+#include <asm/bL_switcher.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power_cpu_migrate.h>
+
+
+/*
+ * Use our own MPIDR accessors as the generic ones in asm/cputype.h have
+ * __attribute_const__ and we don't want the compiler to assume any
+ * constness here as the value _does_ change along some code paths.
+ */
+
+static int read_mpidr(void)
+{
+	unsigned int id;
+	asm volatile ("mrc p15, 0, %0, c0, c0, 5" : "=r" (id));
+	return id & MPIDR_HWID_BITMASK;
+}
+
+/*
+ * Get a global nanosecond time stamp for tracing.
+ */
+static s64 get_ns(void)
+{
+	struct timespec ts;
+	getnstimeofday(&ts);
+	return timespec_to_ns(&ts);
+}
+
+/*
+ * bL switcher core code.
+ */
+
+static void bL_do_switch(void *_arg)
+{
+	unsigned ib_mpidr, ib_cpu, ib_cluster;
+	long volatile handshake, **handshake_ptr = _arg;
+
+	pr_debug("%s\n", __func__);
+
+	ib_mpidr = cpu_logical_map(smp_processor_id());
+	ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+	ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+	/* Advertise our handshake location */
+	if (handshake_ptr) {
+		handshake = 0;
+		*handshake_ptr = &handshake;
+	} else
+		handshake = -1;
+
+	/*
+	 * Our state has been saved at this point.  Let's release our
+	 * inbound CPU.
+	 */
+	mcpm_set_entry_vector(ib_cpu, ib_cluster, cpu_resume);
+	sev();
+
+	/*
+	 * From this point, we must assume that our counterpart CPU might
+	 * have taken over in its parallel world already, as if execution
+	 * just returned from cpu_suspend().  It is therefore important to
+	 * be very careful not to make any change the other guy is not
+	 * expecting.  This is why we need stack isolation.
+	 *
+	 * Fancy under cover tasks could be performed here.  For now
+	 * we have none.
+	 */
+
+	/*
+	 * Let's wait until our inbound is alive.
+	 */
+	while (!handshake) {
+		wfe();
+		smp_mb();
+	}
+
+	/* Let's put ourself down. */
+	mcpm_cpu_power_down();
+
+	/* should never get here */
+	BUG();
+}
+
+/*
+ * Stack isolation.  To ensure 'current' remains valid, we just use another
+ * piece of our thread's stack space which should be fairly lightly used.
+ * The selected area starts just above the thread_info structure located
+ * at the very bottom of the stack, aligned to a cache line, and indexed
+ * with the cluster number.
+ */
+#define STACK_SIZE 512
+extern void call_with_stack(void (*fn)(void *), void *arg, void *sp);
+static int bL_switchpoint(unsigned long _arg)
+{
+	unsigned int mpidr = read_mpidr();
+	unsigned int clusterid = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+	void *stack = current_thread_info() + 1;
+	stack = PTR_ALIGN(stack, L1_CACHE_BYTES);
+	stack += clusterid * STACK_SIZE + STACK_SIZE;
+	call_with_stack(bL_do_switch, (void *)_arg, stack);
+	BUG();
+}
+
+/*
+ * Generic switcher interface
+ */
+
+static unsigned int bL_gic_id[MAX_CPUS_PER_CLUSTER][MAX_NR_CLUSTERS];
+static int bL_switcher_cpu_pairing[NR_CPUS];
+
+/*
+ * bL_switch_to - Switch to a specific cluster for the current CPU
+ * @new_cluster_id: the ID of the cluster to switch to.
+ *
+ * This function must be called on the CPU to be switched.
+ * Returns 0 on success, else a negative status code.
+ */
+static int bL_switch_to(unsigned int new_cluster_id)
+{
+	unsigned int mpidr, this_cpu, that_cpu;
+	unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster;
+	struct completion inbound_alive;
+	struct tick_device *tdev;
+	enum clock_event_mode tdev_mode;
+	long volatile *handshake_ptr;
+	int ipi_nr, ret;
+
+	this_cpu = smp_processor_id();
+	ob_mpidr = read_mpidr();
+	ob_cpu = MPIDR_AFFINITY_LEVEL(ob_mpidr, 0);
+	ob_cluster = MPIDR_AFFINITY_LEVEL(ob_mpidr, 1);
+	BUG_ON(cpu_logical_map(this_cpu) != ob_mpidr);
+
+	if (new_cluster_id == ob_cluster)
+		return 0;
+
+	that_cpu = bL_switcher_cpu_pairing[this_cpu];
+	ib_mpidr = cpu_logical_map(that_cpu);
+	ib_cpu = MPIDR_AFFINITY_LEVEL(ib_mpidr, 0);
+	ib_cluster = MPIDR_AFFINITY_LEVEL(ib_mpidr, 1);
+
+	pr_debug("before switch: CPU %d MPIDR %#x -> %#x\n",
+		 this_cpu, ob_mpidr, ib_mpidr);
+
+	this_cpu = smp_processor_id();
+
+	/* Close the gate for our entry vectors */
+	mcpm_set_entry_vector(ob_cpu, ob_cluster, NULL);
+	mcpm_set_entry_vector(ib_cpu, ib_cluster, NULL);
+
+	/* Install our "inbound alive" notifier. */
+	init_completion(&inbound_alive);
+	ipi_nr = register_ipi_completion(&inbound_alive, this_cpu);
+	ipi_nr |= ((1 << 16) << bL_gic_id[ob_cpu][ob_cluster]);
+	mcpm_set_early_poke(ib_cpu, ib_cluster, gic_get_sgir_physaddr(), ipi_nr);
+
+	/*
+	 * Let's wake up the inbound CPU now in case it requires some delay
+	 * to come online, but leave it gated in our entry vector code.
+	 */
+	ret = mcpm_cpu_power_up(ib_cpu, ib_cluster);
+	if (ret) {
+		pr_err("%s: mcpm_cpu_power_up() returned %d\n", __func__, ret);
+		return ret;
+	}
+
+	/*
+	 * Raise a SGI on the inbound CPU to make sure it doesn't stall
+	 * in a possible WFI, such as in bL_power_down().
+	 */
+	gic_send_sgi(bL_gic_id[ib_cpu][ib_cluster], 0);
+
+	/*
+	 * Wait for the inbound to come up.  This allows for other
+	 * tasks to be scheduled in the mean time.
+	 */
+	wait_for_completion(&inbound_alive);
+	mcpm_set_early_poke(ib_cpu, ib_cluster, 0, 0);
+
+	/*
+	 * From this point we are entering the switch critical zone
+	 * and can't take any interrupts anymore.
+	 */
+	local_irq_disable();
+	local_fiq_disable();
+	trace_cpu_migrate_begin(get_ns(), ob_mpidr);
+
+	/* redirect GIC's SGIs to our counterpart */
+	gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]);
+
+	tdev = tick_get_device(this_cpu);
+	if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu)))
+		tdev = NULL;
+	if (tdev) {
+		tdev_mode = tdev->evtdev->mode;
+		clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+	}
+
+	ret = cpu_pm_enter();
+
+	/* we can not tolerate errors at this point */
+	if (ret)
+		panic("%s: cpu_pm_enter() returned %d\n", __func__, ret);
+
+	/* Swap the physical CPUs in the logical map for this logical CPU. */
+	cpu_logical_map(this_cpu) = ib_mpidr;
+	cpu_logical_map(that_cpu) = ob_mpidr;
+
+	/* Let's do the actual CPU switch. */
+	ret = cpu_suspend((unsigned long)&handshake_ptr, bL_switchpoint);
+	if (ret > 0)
+		panic("%s: cpu_suspend() returned %d\n", __func__, ret);
+
+	/* We are executing on the inbound CPU at this point */
+	mpidr = read_mpidr();
+	pr_debug("after switch: CPU %d MPIDR %#x\n", this_cpu, mpidr);
+	BUG_ON(mpidr != ib_mpidr);
+
+	mcpm_cpu_powered_up();
+
+	ret = cpu_pm_exit();
+
+	if (tdev) {
+		clockevents_set_mode(tdev->evtdev, tdev_mode);
+		clockevents_program_event(tdev->evtdev,
+					  tdev->evtdev->next_event, 1);
+	}
+
+	trace_cpu_migrate_finish(get_ns(), ib_mpidr);
+	local_fiq_enable();
+	local_irq_enable();
+
+	*handshake_ptr = 1;
+	dsb_sev();
+
+	if (ret)
+		pr_err("%s exiting with error %d\n", __func__, ret);
+	return ret;
+}
+
+struct bL_thread {
+	spinlock_t lock;
+	struct task_struct *task;
+	wait_queue_head_t wq;
+	int wanted_cluster;
+	struct completion started;
+	bL_switch_completion_handler completer;
+	void *completer_cookie;
+};
+
+static struct bL_thread bL_threads[NR_CPUS];
+
+static int bL_switcher_thread(void *arg)
+{
+	struct bL_thread *t = arg;
+	struct sched_param param = { .sched_priority = 1 };
+	int cluster;
+	bL_switch_completion_handler completer;
+	void *completer_cookie;
+
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+	complete(&t->started);
+
+	do {
+		if (signal_pending(current))
+			flush_signals(current);
+		wait_event_interruptible(t->wq,
+				t->wanted_cluster != -1 ||
+				kthread_should_stop());
+
+		spin_lock(&t->lock);
+		cluster = t->wanted_cluster;
+		completer = t->completer;
+		completer_cookie = t->completer_cookie;
+		t->wanted_cluster = -1;
+		t->completer = NULL;
+		spin_unlock(&t->lock);
+
+		if (cluster != -1) {
+			bL_switch_to(cluster);
+
+			if (completer)
+				completer(completer_cookie);
+		}
+	} while (!kthread_should_stop());
+
+	return 0;
+}
+
+static struct task_struct *bL_switcher_thread_create(int cpu, void *arg)
+{
+	struct task_struct *task;
+
+	task = kthread_create_on_node(bL_switcher_thread, arg,
+				      cpu_to_node(cpu), "kswitcher_%d", cpu);
+	if (!IS_ERR(task)) {
+		kthread_bind(task, cpu);
+		wake_up_process(task);
+	} else
+		pr_err("%s failed for CPU %d\n", __func__, cpu);
+	return task;
+}
+
+/*
+ * bL_switch_request_cb - Switch to a specific cluster for the given CPU,
+ *      with completion notification via a callback
+ *
+ * @cpu: the CPU to switch
+ * @new_cluster_id: the ID of the cluster to switch to.
+ * @completer: switch completion callback.  if non-NULL,
+ *	@completer(@completer_cookie) will be called on completion of
+ *	the switch, in non-atomic context.
+ * @completer_cookie: opaque context argument for @completer.
+ *
+ * This function causes a cluster switch on the given CPU by waking up
+ * the appropriate switcher thread.  This function may or may not return
+ * before the switch has occurred.
+ *
+ * If a @completer callback function is supplied, it will be called when
+ * the switch is complete.  This can be used to determine asynchronously
+ * when the switch is complete, regardless of when bL_switch_request()
+ * returns.  When @completer is supplied, no new switch request is permitted
+ * for the affected CPU until after the switch is complete, and @completer
+ * has returned.
+ */
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+			 bL_switch_completion_handler completer,
+			 void *completer_cookie)
+{
+	struct bL_thread *t;
+
+	if (cpu >= ARRAY_SIZE(bL_threads)) {
+		pr_err("%s: cpu %d out of bounds\n", __func__, cpu);
+		return -EINVAL;
+	}
+
+	t = &bL_threads[cpu];
+
+	if (IS_ERR(t->task))
+		return PTR_ERR(t->task);
+	if (!t->task)
+		return -ESRCH;
+
+	spin_lock(&t->lock);
+	if (t->completer) {
+		spin_unlock(&t->lock);
+		return -EBUSY;
+	}
+	t->completer = completer;
+	t->completer_cookie = completer_cookie;
+	t->wanted_cluster = new_cluster_id;
+	spin_unlock(&t->lock);
+	wake_up(&t->wq);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bL_switch_request_cb);
+
+/*
+ * Activation and configuration code.
+ */
+
+static DEFINE_MUTEX(bL_switcher_activation_lock);
+static BLOCKING_NOTIFIER_HEAD(bL_activation_notifier);
+static unsigned int bL_switcher_active;
+static unsigned int bL_switcher_cpu_original_cluster[NR_CPUS];
+static cpumask_t bL_switcher_removed_logical_cpus;
+
+int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_register_notifier);
+
+int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&bL_activation_notifier, nb);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_unregister_notifier);
+
+static int bL_activation_notify(unsigned long val)
+{
+	int ret;
+
+	ret = blocking_notifier_call_chain(&bL_activation_notifier, val, NULL);
+	if (ret & NOTIFY_STOP_MASK)
+		pr_err("%s: notifier chain failed with status 0x%x\n",
+			__func__, ret);
+	return notifier_to_errno(ret);
+}
+
+static void bL_switcher_restore_cpus(void)
+{
+	int i;
+
+	for_each_cpu(i, &bL_switcher_removed_logical_cpus)
+		cpu_up(i);
+}
+
+static int bL_switcher_halve_cpus(void)
+{
+	int i, j, cluster_0, gic_id, ret;
+	unsigned int cpu, cluster, mask;
+	cpumask_t available_cpus;
+
+	/* First pass to validate what we have */
+	mask = 0;
+	for_each_online_cpu(i) {
+		cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+		cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+		if (cluster >= 2) {
+			pr_err("%s: only dual cluster systems are supported\n", __func__);
+			return -EINVAL;
+		}
+		if (WARN_ON(cpu >= MAX_CPUS_PER_CLUSTER))
+			return -EINVAL;
+		mask |= (1 << cluster);
+	}
+	if (mask != 3) {
+		pr_err("%s: no CPU pairing possible\n", __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * Now let's do the pairing.  We match each CPU with another CPU
+	 * from a different cluster.  To get a uniform scheduling behavior
+	 * without fiddling with CPU topology and compute capacity data,
+	 * we'll use logical CPUs initially belonging to the same cluster.
+	 */
+	memset(bL_switcher_cpu_pairing, -1, sizeof(bL_switcher_cpu_pairing));
+	cpumask_copy(&available_cpus, cpu_online_mask);
+	cluster_0 = -1;
+	for_each_cpu(i, &available_cpus) {
+		int match = -1;
+		cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+		if (cluster_0 == -1)
+			cluster_0 = cluster;
+		if (cluster != cluster_0)
+			continue;
+		cpumask_clear_cpu(i, &available_cpus);
+		for_each_cpu(j, &available_cpus) {
+			cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(j), 1);
+			/*
+			 * Let's remember the last match to create "odd"
+			 * pairings on purpose in order for other code not
+			 * to assume any relation between physical and
+			 * logical CPU numbers.
+			 */
+			if (cluster != cluster_0)
+				match = j;
+		}
+		if (match != -1) {
+			bL_switcher_cpu_pairing[i] = match;
+			cpumask_clear_cpu(match, &available_cpus);
+			pr_info("CPU%d paired with CPU%d\n", i, match);
+		}
+	}
+
+	/*
+	 * Now we disable the unwanted CPUs i.e. everything that has no
+	 * pairing information (that includes the pairing counterparts).
+	 */
+	cpumask_clear(&bL_switcher_removed_logical_cpus);
+	for_each_online_cpu(i) {
+		cpu = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 0);
+		cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(i), 1);
+
+		/* Let's take note of the GIC ID for this CPU */
+		gic_id = gic_get_cpu_id(i);
+		if (gic_id < 0) {
+			pr_err("%s: bad GIC ID for CPU %d\n", __func__, i);
+			bL_switcher_restore_cpus();
+			return -EINVAL;
+		}
+		bL_gic_id[cpu][cluster] = gic_id;
+		pr_info("GIC ID for CPU %u cluster %u is %u\n",
+			cpu, cluster, gic_id);
+
+		if (bL_switcher_cpu_pairing[i] != -1) {
+			bL_switcher_cpu_original_cluster[i] = cluster;
+			continue;
+		}
+
+		ret = cpu_down(i);
+		if (ret) {
+			bL_switcher_restore_cpus();
+			return ret;
+		}
+		cpumask_set_cpu(i, &bL_switcher_removed_logical_cpus);
+	}
+
+	return 0;
+}
+
+/* Determine the logical CPU a given physical CPU is grouped on. */
+int bL_switcher_get_logical_index(u32 mpidr)
+{
+	int cpu;
+
+	if (!bL_switcher_active)
+		return -EUNATCH;
+
+	mpidr &= MPIDR_HWID_BITMASK;
+	for_each_online_cpu(cpu) {
+		int pairing = bL_switcher_cpu_pairing[cpu];
+		if (pairing == -1)
+			continue;
+		if ((mpidr == cpu_logical_map(cpu)) ||
+		    (mpidr == cpu_logical_map(pairing)))
+			return cpu;
+	}
+	return -EINVAL;
+}
+
+static void bL_switcher_trace_trigger_cpu(void *__always_unused info)
+{
+	trace_cpu_migrate_current(get_ns(), read_mpidr());
+}
+
+int bL_switcher_trace_trigger(void)
+{
+	int ret;
+
+	preempt_disable();
+
+	bL_switcher_trace_trigger_cpu(NULL);
+	ret = smp_call_function(bL_switcher_trace_trigger_cpu, NULL, true);
+
+	preempt_enable();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_trace_trigger);
+
+static int bL_switcher_enable(void)
+{
+	int cpu, ret;
+
+	mutex_lock(&bL_switcher_activation_lock);
+	lock_device_hotplug();
+	if (bL_switcher_active) {
+		unlock_device_hotplug();
+		mutex_unlock(&bL_switcher_activation_lock);
+		return 0;
+	}
+
+	pr_info("big.LITTLE switcher initializing\n");
+
+	ret = bL_activation_notify(BL_NOTIFY_PRE_ENABLE);
+	if (ret)
+		goto error;
+
+	ret = bL_switcher_halve_cpus();
+	if (ret)
+		goto error;
+
+	bL_switcher_trace_trigger();
+
+	for_each_online_cpu(cpu) {
+		struct bL_thread *t = &bL_threads[cpu];
+		spin_lock_init(&t->lock);
+		init_waitqueue_head(&t->wq);
+		init_completion(&t->started);
+		t->wanted_cluster = -1;
+		t->task = bL_switcher_thread_create(cpu, t);
+	}
+
+	bL_switcher_active = 1;
+	bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+	pr_info("big.LITTLE switcher initialized\n");
+	goto out;
+
+error:
+	pr_warn("big.LITTLE switcher initialization failed\n");
+	bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+	unlock_device_hotplug();
+	mutex_unlock(&bL_switcher_activation_lock);
+	return ret;
+}
+
+#ifdef CONFIG_SYSFS
+
+static void bL_switcher_disable(void)
+{
+	unsigned int cpu, cluster;
+	struct bL_thread *t;
+	struct task_struct *task;
+
+	mutex_lock(&bL_switcher_activation_lock);
+	lock_device_hotplug();
+
+	if (!bL_switcher_active)
+		goto out;
+
+	if (bL_activation_notify(BL_NOTIFY_PRE_DISABLE) != 0) {
+		bL_activation_notify(BL_NOTIFY_POST_ENABLE);
+		goto out;
+	}
+
+	bL_switcher_active = 0;
+
+	/*
+	 * To deactivate the switcher, we must shut down the switcher
+	 * threads to prevent any other requests from being accepted.
+	 * Then, if the final cluster for given logical CPU is not the
+	 * same as the original one, we'll recreate a switcher thread
+	 * just for the purpose of switching the CPU back without any
+	 * possibility for interference from external requests.
+	 */
+	for_each_online_cpu(cpu) {
+		t = &bL_threads[cpu];
+		task = t->task;
+		t->task = NULL;
+		if (!task || IS_ERR(task))
+			continue;
+		kthread_stop(task);
+		/* no more switch may happen on this CPU at this point */
+		cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+		if (cluster == bL_switcher_cpu_original_cluster[cpu])
+			continue;
+		init_completion(&t->started);
+		t->wanted_cluster = bL_switcher_cpu_original_cluster[cpu];
+		task = bL_switcher_thread_create(cpu, t);
+		if (!IS_ERR(task)) {
+			wait_for_completion(&t->started);
+			kthread_stop(task);
+			cluster = MPIDR_AFFINITY_LEVEL(cpu_logical_map(cpu), 1);
+			if (cluster == bL_switcher_cpu_original_cluster[cpu])
+				continue;
+		}
+		/* If execution gets here, we're in trouble. */
+		pr_crit("%s: unable to restore original cluster for CPU %d\n",
+			__func__, cpu);
+		pr_crit("%s: CPU %d can't be restored\n",
+			__func__, bL_switcher_cpu_pairing[cpu]);
+		cpumask_clear_cpu(bL_switcher_cpu_pairing[cpu],
+				  &bL_switcher_removed_logical_cpus);
+	}
+
+	bL_switcher_restore_cpus();
+	bL_switcher_trace_trigger();
+
+	bL_activation_notify(BL_NOTIFY_POST_DISABLE);
+
+out:
+	unlock_device_hotplug();
+	mutex_unlock(&bL_switcher_activation_lock);
+}
+
+static ssize_t bL_switcher_active_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", bL_switcher_active);
+}
+
+static ssize_t bL_switcher_active_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int ret;
+
+	switch (buf[0]) {
+	case '0':
+		bL_switcher_disable();
+		ret = 0;
+		break;
+	case '1':
+		ret = bL_switcher_enable();
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return (ret >= 0) ? count : ret;
+}
+
+static ssize_t bL_switcher_trace_trigger_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	int ret = bL_switcher_trace_trigger();
+
+	return ret ? ret : count;
+}
+
+static struct kobj_attribute bL_switcher_active_attr =
+	__ATTR(active, 0644, bL_switcher_active_show, bL_switcher_active_store);
+
+static struct kobj_attribute bL_switcher_trace_trigger_attr =
+	__ATTR(trace_trigger, 0200, NULL, bL_switcher_trace_trigger_store);
+
+static struct attribute *bL_switcher_attrs[] = {
+	&bL_switcher_active_attr.attr,
+	&bL_switcher_trace_trigger_attr.attr,
+	NULL,
+};
+
+static struct attribute_group bL_switcher_attr_group = {
+	.attrs = bL_switcher_attrs,
+};
+
+static struct kobject *bL_switcher_kobj;
+
+static int __init bL_switcher_sysfs_init(void)
+{
+	int ret;
+
+	bL_switcher_kobj = kobject_create_and_add("bL_switcher", kernel_kobj);
+	if (!bL_switcher_kobj)
+		return -ENOMEM;
+	ret = sysfs_create_group(bL_switcher_kobj, &bL_switcher_attr_group);
+	if (ret)
+		kobject_put(bL_switcher_kobj);
+	return ret;
+}
+
+#endif  /* CONFIG_SYSFS */
+
+bool bL_switcher_get_enabled(void)
+{
+	mutex_lock(&bL_switcher_activation_lock);
+
+	return bL_switcher_active;
+}
+EXPORT_SYMBOL_GPL(bL_switcher_get_enabled);
+
+void bL_switcher_put_enabled(void)
+{
+	mutex_unlock(&bL_switcher_activation_lock);
+}
+EXPORT_SYMBOL_GPL(bL_switcher_put_enabled);
+
+/*
+ * Veto any CPU hotplug operation on those CPUs we've removed
+ * while the switcher is active.
+ * We're just not ready to deal with that given the trickery involved.
+ */
+static int bL_switcher_hotplug_callback(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	if (bL_switcher_active) {
+		int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu];
+		switch (action & 0xf) {
+		case CPU_UP_PREPARE:
+		case CPU_DOWN_PREPARE:
+			if (pairing == -1)
+				return NOTIFY_BAD;
+		}
+	}
+	return NOTIFY_DONE;
+}
+
+static bool no_bL_switcher;
+core_param(no_bL_switcher, no_bL_switcher, bool, 0644);
+
+static int __init bL_switcher_init(void)
+{
+	int ret;
+
+	if (MAX_NR_CLUSTERS != 2) {
+		pr_err("%s: only dual cluster systems are supported\n", __func__);
+		return -EINVAL;
+	}
+
+	cpu_notifier(bL_switcher_hotplug_callback, 0);
+
+	if (!no_bL_switcher) {
+		ret = bL_switcher_enable();
+		if (ret)
+			return ret;
+	}
+
+#ifdef CONFIG_SYSFS
+	ret = bL_switcher_sysfs_init();
+	if (ret)
+		pr_err("%s: unable to create sysfs entry\n", __func__);
+#endif
+
+	return 0;
+}
+
+late_initcall(bL_switcher_init);
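
The switcher above exposes its runtime controls through sysfs: bL_switcher_sysfs_init() creates a "bL_switcher" kobject under kernel_kobj with a read/write "active" attribute and a write-only "trace_trigger" attribute.  A minimal userspace sketch (not part of this commit), assuming sysfs is mounted at /sys so the attribute appears as /sys/kernel/bL_switcher/active:

/*
 * Minimal sketch: toggle the big.LITTLE switcher via its sysfs knob.
 * Assumes sysfs is mounted at /sys; the "active" attribute is created
 * by bL_switcher_sysfs_init() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int bl_switcher_set_active(int enable)
{
	int fd = open("/sys/kernel/bL_switcher/active", O_WRONLY);
	char val = enable ? '1' : '0';

	if (fd < 0) {
		perror("open");
		return -1;
	}
	if (write(fd, &val, 1) != 1) {	/* store handler only looks at buf[0] */
		perror("write");
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	return bl_switcher_set_active(1) ? 1 : 0;	/* re-enable the switcher */
}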

+ 71 - 0
arch/arm/common/bL_switcher_dummy_if.c

@@ -0,0 +1,71 @@
+/*
+ * arch/arm/common/bL_switcher_dummy_if.c -- b.L switcher dummy interface
+ *
+ * Created by:	Nicolas Pitre, November 2012
+ * Copyright:	(C) 2012-2013  Linaro Limited
+ *
+ * Dummy interface to user space for debugging purpose only.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <asm/uaccess.h>
+#include <asm/bL_switcher.h>
+
+static ssize_t bL_switcher_write(struct file *file, const char __user *buf,
+			size_t len, loff_t *pos)
+{
+	unsigned char val[3];
+	unsigned int cpu, cluster;
+	int ret;
+
+	pr_debug("%s\n", __func__);
+
+	if (len < 3)
+		return -EINVAL;
+
+	if (copy_from_user(val, buf, 3))
+		return -EFAULT;
+
+	/* format: <cpu#>,<cluster#> */
+	if (val[0] < '0' || val[0] > '9' ||
+	    val[1] != ',' ||
+	    val[2] < '0' || val[2] > '1')
+		return -EINVAL;
+
+	cpu = val[0] - '0';
+	cluster = val[2] - '0';
+	ret = bL_switch_request(cpu, cluster);
+
+	return ret ? : len;
+}
+
+static const struct file_operations bL_switcher_fops = {
+	.write		= bL_switcher_write,
+	.owner	= THIS_MODULE,
+};
+
+static struct miscdevice bL_switcher_device = {
+	MISC_DYNAMIC_MINOR,
+	"b.L_switcher",
+	&bL_switcher_fops
+};
+
+static int __init bL_switcher_dummy_if_init(void)
+{
+	return misc_register(&bL_switcher_device);
+}
+
+static void __exit bL_switcher_dummy_if_exit(void)
+{
+	misc_deregister(&bL_switcher_device);
+}
+
+module_init(bL_switcher_dummy_if_init);
+module_exit(bL_switcher_dummy_if_exit);
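
The debug interface above accepts writes of the form "<cpu#>,<cluster#>" and forwards them to bL_switch_request().  A minimal userspace sketch (not part of this commit) follows; the device node path is an assumption (a devtmpfs/udev setup would name the node after the registered misc device, i.e. /dev/b.L_switcher):

/*
 * Minimal sketch: ask the dummy interface to move CPU 0 onto cluster 1.
 * The "0,1" payload matches the "<cpu#>,<cluster#>" format parsed by
 * bL_switcher_write() above; /dev/b.L_switcher is an assumed node name.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char req[] = "0,1";
	int fd = open("/dev/b.L_switcher", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, req, strlen(req)) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	return close(fd) ? 1 : 0;
}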

+ 27 - 0
arch/arm/common/mcpm_entry.c

@@ -27,6 +27,18 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr)
 	sync_cache_w(&mcpm_entry_vectors[cluster][cpu]);
 }
 
+extern unsigned long mcpm_entry_early_pokes[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER][2];
+
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+			 unsigned long poke_phys_addr, unsigned long poke_val)
+{
+	unsigned long *poke = &mcpm_entry_early_pokes[cluster][cpu][0];
+	poke[0] = poke_phys_addr;
+	poke[1] = poke_val;
+	__cpuc_flush_dcache_area((void *)poke, 8);
+	outer_clean_range(__pa(poke), __pa(poke + 2));
+}
+
 static const struct mcpm_platform_ops *platform_ops;
 
 int __init mcpm_platform_register(const struct mcpm_platform_ops *ops)
@@ -90,6 +102,21 @@ void mcpm_cpu_power_down(void)
 	BUG();
 }
 
+int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster)
+{
+	int ret;
+
+	if (WARN_ON_ONCE(!platform_ops || !platform_ops->power_down_finish))
+		return -EUNATCH;
+
+	ret = platform_ops->power_down_finish(cpu, cluster);
+	if (ret)
+		pr_warn("%s: cpu %u, cluster %u failed to power down (%d)\n",
+			__func__, cpu, cluster, ret);
+
+	return ret;
+}
+
 void mcpm_cpu_suspend(u64 expected_residency)
 {
 	phys_reset_t phys_reset;

+ 16 - 2
arch/arm/common/mcpm_head.S

@@ -15,6 +15,7 @@
 
 #include <linux/linkage.h>
 #include <asm/mcpm.h>
+#include <asm/assembler.h>
 
 #include "vlock.h"
 
@@ -47,6 +48,7 @@
 
 ENTRY(mcpm_entry_point)
 
+ ARM_BE8(setend        be)
  THUMB(	adr	r12, BSYM(1f)	)
  THUMB(	bx	r12		)
  THUMB(	.thumb			)
@@ -71,12 +73,19 @@ ENTRY(mcpm_entry_point)
 	 * position independent way.
 	 */
 	adr	r5, 3f
-	ldmia	r5, {r6, r7, r8, r11}
+	ldmia	r5, {r0, r6, r7, r8, r11}
+	add	r0, r5, r0			@ r0 = mcpm_entry_early_pokes
 	add	r6, r5, r6			@ r6 = mcpm_entry_vectors
 	ldr	r7, [r5, r7]			@ r7 = mcpm_power_up_setup_phys
 	add	r8, r5, r8			@ r8 = mcpm_sync
 	add	r11, r5, r11			@ r11 = first_man_locks
 
+	@ Perform an early poke, if any
+	add	r0, r0, r4, lsl #3
+	ldmia	r0, {r0, r1}
+	teq	r0, #0
+	strne	r1, [r0]
+
 	mov	r0, #MCPM_SYNC_CLUSTER_SIZE
 	mla	r8, r0, r10, r8			@ r8 = sync cluster base
 
@@ -195,7 +204,8 @@ mcpm_entry_gated:
 
 	.align	2
 
-3:	.word	mcpm_entry_vectors - .
+3:	.word	mcpm_entry_early_pokes - .
+	.word	mcpm_entry_vectors - 3b
 	.word	mcpm_power_up_setup_phys - 3b
 	.word	mcpm_sync - 3b
 	.word	first_man_locks - 3b
@@ -214,6 +224,10 @@ first_man_locks:
 ENTRY(mcpm_entry_vectors)
 	.space	4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
 
+	.type	mcpm_entry_early_pokes, #object
+ENTRY(mcpm_entry_early_pokes)
+	.space	8 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
 	.type	mcpm_power_up_setup_phys, #object
 ENTRY(mcpm_power_up_setup_phys)
 	.space  4		@ set by mcpm_sync_init()

+ 23 - 4
arch/arm/common/mcpm_platsmp.c

@@ -19,14 +19,23 @@
 #include <asm/smp.h>
 #include <asm/smp_plat.h>
 
+static void cpu_to_pcpu(unsigned int cpu,
+			unsigned int *pcpu, unsigned int *pcluster)
+{
+	unsigned int mpidr;
+
+	mpidr = cpu_logical_map(cpu);
+	*pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	*pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+}
+
 static int mcpm_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
-	unsigned int mpidr, pcpu, pcluster, ret;
+	unsigned int pcpu, pcluster, ret;
 	extern void secondary_startup(void);
 
-	mpidr = cpu_logical_map(cpu);
-	pcpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-	pcluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+	cpu_to_pcpu(cpu, &pcpu, &pcluster);
+
 	pr_debug("%s: logical CPU %d is physical CPU %d cluster %d\n",
 		 __func__, cpu, pcpu, pcluster);
 
@@ -47,6 +56,15 @@ static void mcpm_secondary_init(unsigned int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+static int mcpm_cpu_kill(unsigned int cpu)
+{
+	unsigned int pcpu, pcluster;
+
+	cpu_to_pcpu(cpu, &pcpu, &pcluster);
+
+	return !mcpm_cpu_power_down_finish(pcpu, pcluster);
+}
+
 static int mcpm_cpu_disable(unsigned int cpu)
 {
 	/*
@@ -73,6 +91,7 @@ static struct smp_operations __initdata mcpm_smp_ops = {
 	.smp_boot_secondary	= mcpm_boot_secondary,
 	.smp_secondary_init	= mcpm_secondary_init,
 #ifdef CONFIG_HOTPLUG_CPU
+	.cpu_kill		= mcpm_cpu_kill,
 	.cpu_disable		= mcpm_cpu_disable,
 	.cpu_die		= mcpm_cpu_die,
 #endif

+ 1 - 1
arch/arm/common/timer-sp.c

@@ -175,7 +175,7 @@ static struct clock_event_device sp804_clockevent = {
 
 static struct irqaction sp804_timer_irq = {
 	.name		= "timer",
-	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
+	.flags		= IRQF_TIMER | IRQF_IRQPOLL,
 	.handler	= sp804_timer_interrupt,
 	.dev_id		= &sp804_clockevent,
 };

+ 6 - 16
arch/arm/configs/h3600_defconfig

@@ -1,5 +1,6 @@
-CONFIG_EXPERIMENTAL=y
 CONFIG_SYSVIPC=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_HIGH_RES_TIMERS=y
 CONFIG_LOG_BUF_SHIFT=14
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_MODULES=y
@@ -11,11 +12,11 @@ CONFIG_ARCH_SA1100=y
 CONFIG_SA1100_H3600=y
 CONFIG_PCCARD=y
 CONFIG_PCMCIA_SA1100=y
+CONFIG_PREEMPT=y
 CONFIG_ZBOOT_ROM_TEXT=0x0
 CONFIG_ZBOOT_ROM_BSS=0x0
 # CONFIG_CPU_FREQ_STAT is not set
 CONFIG_FPE_NWFPE=y
-CONFIG_PM=y
 CONFIG_NET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
@@ -24,13 +25,10 @@ CONFIG_IRDA=m
 CONFIG_IRLAN=m
 CONFIG_IRNET=m
 CONFIG_IRCOMM=m
-CONFIG_SA1100_FIR=m
 # CONFIG_WIRELESS is not set
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
 CONFIG_MTD=y
-CONFIG_MTD_PARTITIONS=y
 CONFIG_MTD_REDBOOT_PARTS=y
-CONFIG_MTD_CHAR=y
 CONFIG_MTD_BLOCK=y
 CONFIG_MTD_CFI=y
 CONFIG_MTD_CFI_ADV_OPTIONS=y
@@ -41,19 +39,15 @@ CONFIG_MTD_SA1100=y
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
-# CONFIG_MISC_DEVICES is not set
 CONFIG_IDE=y
 CONFIG_BLK_DEV_IDECS=y
 CONFIG_NETDEVICES=y
-# CONFIG_NETDEV_1000 is not set
-# CONFIG_NETDEV_10000 is not set
-# CONFIG_WLAN is not set
-CONFIG_NET_PCMCIA=y
 CONFIG_PCMCIA_PCNET=y
 CONFIG_PPP=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_DEFLATE=m
 CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPP_ASYNC=m
+# CONFIG_WLAN is not set
 # CONFIG_KEYBOARD_ATKBD is not set
 CONFIG_KEYBOARD_GPIO=y
 # CONFIG_INPUT_MOUSE is not set
@@ -64,8 +58,6 @@ CONFIG_SERIAL_SA1100_CONSOLE=y
 # CONFIG_HWMON is not set
 CONFIG_FB=y
 CONFIG_FB_SA1100=y
-# CONFIG_VGA_CONSOLE is not set
-# CONFIG_HID_SUPPORT is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_EXT2_FS=y
 CONFIG_MSDOS_FS=m
@@ -74,6 +66,4 @@ CONFIG_JFFS2_FS=y
 CONFIG_CRAMFS=m
 CONFIG_NFS_FS=y
 CONFIG_NFSD=m
-CONFIG_SMB_FS=m
 CONFIG_NLS=y
-# CONFIG_RCU_CPU_STALL_DETECTOR is not set

+ 1 - 0
arch/arm/crypto/.gitignore

@@ -0,0 +1 @@
+aesbs-core.S

+ 12 - 2
arch/arm/crypto/Makefile

@@ -3,7 +3,17 @@
 #
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
+obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 
-aes-arm-y  := aes-armv4.o aes_glue.o
-sha1-arm-y := sha1-armv4-large.o sha1_glue.o
+aes-arm-y	:= aes-armv4.o aes_glue.o
+aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
+sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
+
+quiet_cmd_perl = PERL    $@
+      cmd_perl = $(PERL) $(<) > $(@)
+
+$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S

+ 6 - 16
arch/arm/crypto/aes_glue.c

@@ -6,22 +6,12 @@
 #include <linux/crypto.h>
 #include <crypto/aes.h>
 
-#define AES_MAXNR 14
+#include "aes_glue.h"
 
-typedef struct {
-	unsigned int rd_key[4 *(AES_MAXNR + 1)];
-	int rounds;
-} AES_KEY;
-
-struct AES_CTX {
-	AES_KEY enc_key;
-	AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key);
+EXPORT_SYMBOL(AES_encrypt);
+EXPORT_SYMBOL(AES_decrypt);
+EXPORT_SYMBOL(private_AES_set_encrypt_key);
+EXPORT_SYMBOL(private_AES_set_decrypt_key);
 
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -81,7 +71,7 @@ static struct crypto_alg aes_alg = {
 		.cipher	= {
 			.cia_min_keysize	= AES_MIN_KEY_SIZE,
 			.cia_max_keysize	= AES_MAX_KEY_SIZE,
-			.cia_setkey			= aes_set_key,
+			.cia_setkey		= aes_set_key,
 			.cia_encrypt		= aes_encrypt,
 			.cia_decrypt		= aes_decrypt
 		}

+ 19 - 0
arch/arm/crypto/aes_glue.h

@@ -0,0 +1,19 @@
+
+#define AES_MAXNR 14
+
+struct AES_KEY {
+	unsigned int rd_key[4 * (AES_MAXNR + 1)];
+	int rounds;
+};
+
+struct AES_CTX {
+	struct AES_KEY enc_key;
+	struct AES_KEY dec_key;
+};
+
+asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
+asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
+					   const int bits, struct AES_KEY *key);
+asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
+					   const int bits, struct AES_KEY *key);

+ 2544 - 0
arch/arm/crypto/aesbs-core.S_shipped

@@ -0,0 +1,2544 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+@ granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt	19.5 cycles per byte processed with 128-bit key
+@ decrypt	22.1 cycles per byte processed with 128-bit key
+@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@						<appro@openssl.org>
+
+@ April-August 2013
+@
+@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
+@
+@					<ard.biesheuvel@linaro.org>
+
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+# define VFP_ABI_FRAME	0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME	0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code   32
+#endif
+
+.fpu	neon
+
+.type	_bsaes_decrypt8,%function
+.align	4
+_bsaes_decrypt8:
+	adr	r6,_bsaes_decrypt8
+	vldmia	r4!, {q9}		@ round 0 key
+	add	r6,r6,#.LM0ISR-_bsaes_decrypt8
+
+	vldmia	r6!, {q8}		@ .LM0ISR
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	 vtbl.8	d0, {q10}, d16
+	 vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	 vtbl.8	d2, {q11}, d16
+	 vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	 vtbl.8	d4, {q12}, d16
+	 vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	 vtbl.8	d6, {q13}, d16
+	 vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	 vtbl.8	d8, {q14}, d16
+	 vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	 vtbl.8	d10, {q15}, d16
+	 vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	 vtbl.8	d12, {q10}, d16
+	 vtbl.8	d13, {q10}, d17
+	 vtbl.8	d14, {q11}, d16
+	 vtbl.8	d15, {q11}, d17
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q6, #1
+	 vshr.u64	q11, q4, #1
+	veor		q10, q10, q7
+	 veor		q11, q11, q5
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #1
+	 veor		q5, q5, q11
+	 vshl.u64	q11, q11, #1
+	veor		q6, q6, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q2, #1
+	 vshr.u64	q11, q0, #1
+	veor		q10, q10, q3
+	 veor		q11, q11, q1
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q3, q3, q10
+	vshl.u64	q10, q10, #1
+	 veor		q1, q1, q11
+	 vshl.u64	q11, q11, #1
+	veor		q2, q2, q10
+	 veor		q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q5, #2
+	 vshr.u64	q11, q4, #2
+	veor		q10, q10, q7
+	 veor		q11, q11, q6
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #2
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #2
+	veor		q5, q5, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q1, #2
+	 vshr.u64	q11, q0, #2
+	veor		q10, q10, q3
+	 veor		q11, q11, q2
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q3, q3, q10
+	vshl.u64	q10, q10, #2
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #2
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vshr.u64	q10, q3, #4
+	 vshr.u64	q11, q2, #4
+	veor		q10, q10, q7
+	 veor		q11, q11, q6
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #4
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #4
+	veor		q3, q3, q10
+	 veor		q2, q2, q11
+	vshr.u64	q10, q1, #4
+	 vshr.u64	q11, q0, #4
+	veor		q10, q10, q5
+	 veor		q11, q11, q4
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #4
+	 veor		q4, q4, q11
+	 vshl.u64	q11, q11, #4
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	sub	r5,r5,#1
+	b	.Ldec_sbox
+.align	4
+.Ldec_loop:
+	vldmia	r4!, {q8-q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+.Ldec_sbox:
+	 veor	q1, q1, q4
+	veor	q3, q3, q4
+
+	veor	q4, q4, q7
+	 veor	q1, q1, q6
+	veor	q2, q2, q7
+	veor	q6, q6, q4
+
+	veor	q0, q0, q1
+	veor	q2, q2, q5
+	 veor	q7, q7, q6
+	veor	q3, q3, q0
+	veor	q5, q5, q0
+	veor	q1, q1, q3
+	veor	q11, q3, q0
+	veor	q10, q7, q4
+	veor	q9, q1, q6
+	veor	q13, q4, q0
+	 vmov	q8, q10
+	veor	q12, q5, q2
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q6, q2
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q3, q7
+	veor	q12, q1, q5
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q4, q6
+	veor	q9, q9, q14
+	vand	q13, q0, q2
+	vand	q14, q7, q1
+	vorr	q15, q3, q5
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q5, q2
+	veor	q8, q1, q6
+	veor 	q10, q15, q14
+	vand	q10, q10, q5
+	veor	q5, q5, q1
+	vand	q11, q1, q15
+	vand	q5, q5, q14
+	veor	q1, q11, q10
+	veor	q5, q5, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	 veor 	q10, q13, q9
+	vand	q11, q11, q12
+	 vand	q10, q10, q2
+	veor	q12, q12, q8
+	 veor	q2, q2, q6
+	vand	q8, q8, q15
+	 vand	q6, q6, q13
+	vand	q12, q12, q14
+	 vand	q2, q2, q9
+	veor	q8, q8, q12
+	 veor	q2, q2, q6
+	veor	q12, q12, q11
+	 veor	q6, q6, q10
+	veor	q5, q5, q12
+	veor	q2, q2, q12
+	veor	q1, q1, q8
+	veor	q6, q6, q8
+
+	veor	q12, q3, q0
+	veor	q8, q7, q4
+	veor	q11, q15, q14
+	 veor 	q10, q13, q9
+	vand	q11, q11, q12
+	 vand	q10, q10, q0
+	veor	q12, q12, q8
+	 veor	q0, q0, q4
+	vand	q8, q8, q15
+	 vand	q4, q4, q13
+	vand	q12, q12, q14
+	 vand	q0, q0, q9
+	veor	q8, q8, q12
+	 veor	q0, q0, q4
+	veor	q12, q12, q11
+	 veor	q4, q4, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor 	q10, q15, q14
+	vand	q10, q10, q3
+	veor	q3, q3, q7
+	vand	q11, q7, q15
+	vand	q3, q3, q14
+	veor	q7, q11, q10
+	veor	q3, q3, q11
+	veor	q3, q3, q12
+	veor	q0, q0, q12
+	veor	q7, q7, q8
+	veor	q4, q4, q8
+	veor	q1, q1, q7
+	veor	q6, q6, q5
+
+	veor	q4, q4, q1
+	veor	q2, q2, q7
+	veor	q5, q5, q7
+	veor	q4, q4, q2
+	 veor 	q7, q7, q0
+	veor	q4, q4, q5
+	 veor	q3, q3, q6
+	 veor	q6, q6, q1
+	veor	q3, q3, q4
+
+	veor	q4, q4, q0
+	veor	q7, q7, q3
+	subs	r5,r5,#1
+	bcc	.Ldec_done
+	@ multiplication by 0x05-0x00-0x04-0x00
+	vext.8	q8, q0, q0, #8
+	vext.8	q14, q3, q3, #8
+	vext.8	q15, q5, q5, #8
+	veor	q8, q8, q0
+	vext.8	q9, q1, q1, #8
+	veor	q14, q14, q3
+	vext.8	q10, q6, q6, #8
+	veor	q15, q15, q5
+	vext.8	q11, q4, q4, #8
+	veor	q9, q9, q1
+	vext.8	q12, q2, q2, #8
+	veor	q10, q10, q6
+	vext.8	q13, q7, q7, #8
+	veor	q11, q11, q4
+	veor	q12, q12, q2
+	veor	q13, q13, q7
+
+	 veor	q0, q0, q14
+	 veor	q1, q1, q14
+	 veor	q6, q6, q8
+	 veor	q2, q2, q10
+	 veor	q4, q4, q9
+	 veor	q1, q1, q15
+	 veor	q6, q6, q15
+	 veor	q2, q2, q14
+	 veor	q7, q7, q11
+	 veor	q4, q4, q14
+	 veor	q3, q3, q12
+	 veor	q2, q2, q15
+	 veor	q7, q7, q15
+	 veor	q5, q5, q13
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	 veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q6, q6, #12
+	 veor	q1, q1, q9
+	vext.8	q11, q4, q4, #12
+	 veor	q6, q6, q10
+	vext.8	q12, q2, q2, #12
+	 veor	q4, q4, q11
+	vext.8	q13, q7, q7, #12
+	 veor	q2, q2, q12
+	vext.8	q14, q3, q3, #12
+	 veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	 veor	q3, q3, q14
+
+	veor	q9, q9, q0
+	 veor	q5, q5, q15
+	 vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	 vext.8	q1, q1, q1, #8
+	veor	q13, q13, q2
+	 veor	q0, q0, q8
+	veor	q14, q14, q7
+	 veor	q1, q1, q9
+	 vext.8	q8, q2, q2, #8
+	veor	q12, q12, q4
+	 vext.8	q9, q7, q7, #8
+	veor	q15, q15, q3
+	 vext.8	q2, q4, q4, #8
+	veor	q11, q11, q6
+	 vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	 vext.8	q4, q3, q3, #8
+	veor	q11, q11, q5
+	 vext.8	q3, q6, q6, #8
+	veor	q5, q9, q13
+	veor	q11, q11, q2
+	veor	q7, q7, q15
+	veor	q6, q4, q14
+	veor	q4, q8, q12
+	veor	q2, q3, q10
+	vmov	q3, q11
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ .LISR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	.Ldec_loop
+	vldmia	r6, {q12}		@ .LISRM0
+	b	.Ldec_loop
+.align	4
+.Ldec_done:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q3, #1
+	 vshr.u64	q11, q2, #1
+	veor		q10, q10, q5
+	 veor		q11, q11, q7
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #1
+	 veor		q7, q7, q11
+	 vshl.u64	q11, q11, #1
+	veor		q3, q3, q10
+	 veor		q2, q2, q11
+	vshr.u64	q10, q6, #1
+	 vshr.u64	q11, q0, #1
+	veor		q10, q10, q4
+	 veor		q11, q11, q1
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q4, q4, q10
+	vshl.u64	q10, q10, #1
+	 veor		q1, q1, q11
+	 vshl.u64	q11, q11, #1
+	veor		q6, q6, q10
+	 veor		q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q7, #2
+	 vshr.u64	q11, q2, #2
+	veor		q10, q10, q5
+	 veor		q11, q11, q3
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #2
+	 veor		q3, q3, q11
+	 vshl.u64	q11, q11, #2
+	veor		q7, q7, q10
+	 veor		q2, q2, q11
+	vshr.u64	q10, q1, #2
+	 vshr.u64	q11, q0, #2
+	veor		q10, q10, q4
+	 veor		q11, q11, q6
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q4, q4, q10
+	vshl.u64	q10, q10, #2
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #2
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vshr.u64	q10, q4, #4
+	 vshr.u64	q11, q6, #4
+	veor		q10, q10, q5
+	 veor		q11, q11, q3
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #4
+	 veor		q3, q3, q11
+	 vshl.u64	q11, q11, #4
+	veor		q4, q4, q10
+	 veor		q6, q6, q11
+	vshr.u64	q10, q1, #4
+	 vshr.u64	q11, q0, #4
+	veor		q10, q10, q7
+	 veor		q11, q11, q2
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #4
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #4
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q6, q6, q8
+	veor	q4, q4, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q8
+	veor	q3, q3, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+.size	_bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type	_bsaes_const,%object
+.align	6
+_bsaes_const:
+.LM0ISR:	@ InvShiftRows constants
+	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:		@ ShiftRows constants
+	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+	.quad	0x090d01050c000408, 0x03070b0f060a0e02
+.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align	6
+.size	_bsaes_const,.-_bsaes_const
+
+.type	_bsaes_encrypt8,%function
+.align	4
+_bsaes_encrypt8:
+	adr	r6,_bsaes_encrypt8
+	vldmia	r4!, {q9}		@ round 0 key
+	sub	r6,r6,#_bsaes_encrypt8-.LM0SR
+
+	vldmia	r6!, {q8}		@ .LM0SR
+_bsaes_encrypt8_alt:
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	 vtbl.8	d0, {q10}, d16
+	 vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	 vtbl.8	d2, {q11}, d16
+	 vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	 vtbl.8	d4, {q12}, d16
+	 vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	 vtbl.8	d6, {q13}, d16
+	 vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	 vtbl.8	d8, {q14}, d16
+	 vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	 vtbl.8	d10, {q15}, d16
+	 vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	 vtbl.8	d12, {q10}, d16
+	 vtbl.8	d13, {q10}, d17
+	 vtbl.8	d14, {q11}, d16
+	 vtbl.8	d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q6, #1
+	 vshr.u64	q11, q4, #1
+	veor		q10, q10, q7
+	 veor		q11, q11, q5
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #1
+	 veor		q5, q5, q11
+	 vshl.u64	q11, q11, #1
+	veor		q6, q6, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q2, #1
+	 vshr.u64	q11, q0, #1
+	veor		q10, q10, q3
+	 veor		q11, q11, q1
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q3, q3, q10
+	vshl.u64	q10, q10, #1
+	 veor		q1, q1, q11
+	 vshl.u64	q11, q11, #1
+	veor		q2, q2, q10
+	 veor		q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q5, #2
+	 vshr.u64	q11, q4, #2
+	veor		q10, q10, q7
+	 veor		q11, q11, q6
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #2
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #2
+	veor		q5, q5, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q1, #2
+	 vshr.u64	q11, q0, #2
+	veor		q10, q10, q3
+	 veor		q11, q11, q2
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q3, q3, q10
+	vshl.u64	q10, q10, #2
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #2
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vshr.u64	q10, q3, #4
+	 vshr.u64	q11, q2, #4
+	veor		q10, q10, q7
+	 veor		q11, q11, q6
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #4
+	 veor		q6, q6, q11
+	 vshl.u64	q11, q11, #4
+	veor		q3, q3, q10
+	 veor		q2, q2, q11
+	vshr.u64	q10, q1, #4
+	 vshr.u64	q11, q0, #4
+	veor		q10, q10, q5
+	 veor		q11, q11, q4
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #4
+	 veor		q4, q4, q11
+	 vshl.u64	q11, q11, #4
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	sub	r5,r5,#1
+	b	.Lenc_sbox
+.align	4
+.Lenc_loop:
+	vldmia	r4!, {q8-q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+.Lenc_sbox:
+	veor	q2, q2, q1
+	veor	q5, q5, q6
+	veor	q3, q3, q0
+	veor	q6, q6, q2
+	veor	q5, q5, q0
+
+	veor	q6, q6, q3
+	veor	q3, q3, q7
+	veor	q7, q7, q5
+	veor	q3, q3, q4
+	veor	q4, q4, q5
+
+	veor	q2, q2, q7
+	veor	q3, q3, q1
+	veor	q1, q1, q5
+	veor	q11, q7, q4
+	veor	q10, q1, q2
+	veor	q9, q5, q3
+	veor	q13, q2, q4
+	 vmov	q8, q10
+	veor	q12, q6, q0
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q3, q0
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q7, q1
+	veor	q12, q5, q6
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q2, q3
+	veor	q9, q9, q14
+	vand	q13, q4, q0
+	vand	q14, q1, q5
+	vorr	q15, q7, q6
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q6, q0
+	veor	q8, q5, q3
+	veor 	q10, q15, q14
+	vand	q10, q10, q6
+	veor	q6, q6, q5
+	vand	q11, q5, q15
+	vand	q6, q6, q14
+	veor	q5, q11, q10
+	veor	q6, q6, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	 veor 	q10, q13, q9
+	vand	q11, q11, q12
+	 vand	q10, q10, q0
+	veor	q12, q12, q8
+	 veor	q0, q0, q3
+	vand	q8, q8, q15
+	 vand	q3, q3, q13
+	vand	q12, q12, q14
+	 vand	q0, q0, q9
+	veor	q8, q8, q12
+	 veor	q0, q0, q3
+	veor	q12, q12, q11
+	 veor	q3, q3, q10
+	veor	q6, q6, q12
+	veor	q0, q0, q12
+	veor	q5, q5, q8
+	veor	q3, q3, q8
+
+	veor	q12, q7, q4
+	veor	q8, q1, q2
+	veor	q11, q15, q14
+	 veor 	q10, q13, q9
+	vand	q11, q11, q12
+	 vand	q10, q10, q4
+	veor	q12, q12, q8
+	 veor	q4, q4, q2
+	vand	q8, q8, q15
+	 vand	q2, q2, q13
+	vand	q12, q12, q14
+	 vand	q4, q4, q9
+	veor	q8, q8, q12
+	 veor	q4, q4, q2
+	veor	q12, q12, q11
+	 veor	q2, q2, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor 	q10, q15, q14
+	vand	q10, q10, q7
+	veor	q7, q7, q1
+	vand	q11, q1, q15
+	vand	q7, q7, q14
+	veor	q1, q11, q10
+	veor	q7, q7, q11
+	veor	q7, q7, q12
+	veor	q4, q4, q12
+	veor	q1, q1, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q0
+	veor	q1, q1, q6
+	veor	q6, q6, q0
+	veor	q4, q4, q7
+	veor	q0, q0, q1
+
+	veor	q1, q1, q5
+	veor	q5, q5, q2
+	veor	q2, q2, q3
+	veor	q3, q3, q5
+	veor	q4, q4, q5
+
+	veor	q6, q6, q3
+	subs	r5,r5,#1
+	bcc	.Lenc_done
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	 veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q4, q4, #12
+	 veor	q1, q1, q9
+	vext.8	q11, q6, q6, #12
+	 veor	q4, q4, q10
+	vext.8	q12, q3, q3, #12
+	 veor	q6, q6, q11
+	vext.8	q13, q7, q7, #12
+	 veor	q3, q3, q12
+	vext.8	q14, q2, q2, #12
+	 veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	 veor	q2, q2, q14
+
+	veor	q9, q9, q0
+	 veor	q5, q5, q15
+	 vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	 vext.8	q1, q1, q1, #8
+	veor	q13, q13, q3
+	 veor	q0, q0, q8
+	veor	q14, q14, q7
+	 veor	q1, q1, q9
+	 vext.8	q8, q3, q3, #8
+	veor	q12, q12, q6
+	 vext.8	q9, q7, q7, #8
+	veor	q15, q15, q2
+	 vext.8	q3, q6, q6, #8
+	veor	q11, q11, q4
+	 vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	 vext.8	q6, q2, q2, #8
+	veor	q11, q11, q5
+	 vext.8	q2, q4, q4, #8
+	veor	q5, q9, q13
+	veor	q4, q8, q12
+	veor	q3, q3, q11
+	veor	q7, q7, q15
+	veor	q6, q6, q14
+	 @ vmov	q4, q8
+	veor	q2, q2, q10
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ .LSR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	.Lenc_loop
+	vldmia	r6, {q12}		@ .LSRM0
+	b	.Lenc_loop
+.align	4
+.Lenc_done:
+	vmov.i8	q8,#0x55			@ compose .LBS0
+	vmov.i8	q9,#0x33			@ compose .LBS1
+	vshr.u64	q10, q2, #1
+	 vshr.u64	q11, q3, #1
+	veor		q10, q10, q5
+	 veor		q11, q11, q7
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #1
+	 veor		q7, q7, q11
+	 vshl.u64	q11, q11, #1
+	veor		q2, q2, q10
+	 veor		q3, q3, q11
+	vshr.u64	q10, q4, #1
+	 vshr.u64	q11, q0, #1
+	veor		q10, q10, q6
+	 veor		q11, q11, q1
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q6, q6, q10
+	vshl.u64	q10, q10, #1
+	 veor		q1, q1, q11
+	 vshl.u64	q11, q11, #1
+	veor		q4, q4, q10
+	 veor		q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose .LBS2
+	vshr.u64	q10, q7, #2
+	 vshr.u64	q11, q3, #2
+	veor		q10, q10, q5
+	 veor		q11, q11, q2
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #2
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #2
+	veor		q7, q7, q10
+	 veor		q3, q3, q11
+	vshr.u64	q10, q1, #2
+	 vshr.u64	q11, q0, #2
+	veor		q10, q10, q6
+	 veor		q11, q11, q4
+	vand		q10, q10, q9
+	 vand		q11, q11, q9
+	veor		q6, q6, q10
+	vshl.u64	q10, q10, #2
+	 veor		q4, q4, q11
+	 vshl.u64	q11, q11, #2
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vshr.u64	q10, q6, #4
+	 vshr.u64	q11, q4, #4
+	veor		q10, q10, q5
+	 veor		q11, q11, q2
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q5, q5, q10
+	vshl.u64	q10, q10, #4
+	 veor		q2, q2, q11
+	 vshl.u64	q11, q11, #4
+	veor		q6, q6, q10
+	 veor		q4, q4, q11
+	vshr.u64	q10, q1, #4
+	 vshr.u64	q11, q0, #4
+	veor		q10, q10, q7
+	 veor		q11, q11, q3
+	vand		q10, q10, q8
+	 vand		q11, q11, q8
+	veor		q7, q7, q10
+	vshl.u64	q10, q10, #4
+	 veor		q3, q3, q11
+	 vshl.u64	q11, q11, #4
+	veor		q1, q1, q10
+	 veor		q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q4, q4, q8
+	veor	q6, q6, q8
+	veor	q3, q3, q8
+	veor	q7, q7, q8
+	veor	q2, q2, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+.size	_bsaes_encrypt8,.-_bsaes_encrypt8
+.type	_bsaes_key_convert,%function
+.align	4
+_bsaes_key_convert:
+	adr	r6,_bsaes_key_convert
+	vld1.8	{q7},  [r4]!		@ load round 0 key
+	sub	r6,r6,#_bsaes_key_convert-.LM0
+	vld1.8	{q15}, [r4]!		@ load round 1 key
+
+	vmov.i8	q8,  #0x01			@ bit masks
+	vmov.i8	q9,  #0x02
+	vmov.i8	q10, #0x04
+	vmov.i8	q11, #0x08
+	vmov.i8	q12, #0x10
+	vmov.i8	q13, #0x20
+	vldmia	r6, {q14}		@ .LM0
+
+#ifdef __ARMEL__
+	vrev32.8	q7,  q7
+	vrev32.8	q15, q15
+#endif
+	sub	r5,r5,#1
+	vstmia	r12!, {q7}		@ save round 0 key
+	b	.Lkey_loop
+
+.align	4
+.Lkey_loop:
+	vtbl.8	d14,{q15},d28
+	vtbl.8	d15,{q15},d29
+	vmov.i8	q6,  #0x40
+	vmov.i8	q15, #0x80
+
+	vtst.8	q0, q7, q8
+	vtst.8	q1, q7, q9
+	vtst.8	q2, q7, q10
+	vtst.8	q3, q7, q11
+	vtst.8	q4, q7, q12
+	vtst.8	q5, q7, q13
+	vtst.8	q6, q7, q6
+	vtst.8	q7, q7, q15
+	vld1.8	{q15}, [r4]!		@ load next round key
+	vmvn	q0, q0		@ "pnot"
+	vmvn	q1, q1
+	vmvn	q5, q5
+	vmvn	q6, q6
+#ifdef __ARMEL__
+	vrev32.8	q15, q15
+#endif
+	subs	r5,r5,#1
+	vstmia	r12!,{q0-q7}		@ write bit-sliced round key
+	bne	.Lkey_loop
+
+	vmov.i8	q7,#0x63			@ compose .L63
+	@ don't save last round key
+	bx	lr
+.size	_bsaes_key_convert,.-_bsaes_key_convert
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global	bsaes_cbc_encrypt
+.type	bsaes_cbc_encrypt,%function
+.align	5
+bsaes_cbc_encrypt:
+#ifndef	__KERNEL__
+	cmp	r2, #128
+#ifndef	__thumb__
+	blo	AES_cbc_encrypt
+#else
+	bhs	1f
+	b	AES_cbc_encrypt
+1:
+#endif
+#endif
+
+	@ it is up to the caller to make sure we are called with enc == 0
+
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ IV is 1st arg on the stack
+	mov	r2, r2, lsr#4		@ len in 16 byte blocks
+	sub	sp, #0x10			@ scratch space to carry over the IV
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	vldmia	sp, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	sp, {q7}
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, r3, #248
+	vldmia	r4, {q6}
+	vstmia	r12, {q15}			@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+
+.align	2
+0:
+#endif
+
+	vld1.8	{q15}, [r8]		@ load IV
+	b	.Lcbc_dec_loop
+
+.align	4
+.Lcbc_dec_loop:
+	subs	r2, r2, #0x8
+	bmi	.Lcbc_dec_loop_finish
+
+	vld1.8	{q0-q1}, [r0]!	@ load input
+	vld1.8	{q2-q3}, [r0]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	vld1.8	{q4-q5}, [r0]!
+	mov	r5, r10
+	vld1.8	{q6-q7}, [r0]
+	sub	r0, r0, #0x60
+	vstmia	r9, {q15}			@ put aside IV
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10-q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12-q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q14-q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	veor	q5, q5, q14
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	vst1.8	{q5}, [r1]!
+
+	b	.Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+	adds	r2, r2, #8
+	beq	.Lcbc_dec_done
+
+	vld1.8	{q0}, [r0]!		@ load input
+	cmp	r2, #2
+	blo	.Lcbc_dec_one
+	vld1.8	{q1}, [r0]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	mov	r5, r10
+	vstmia	r9, {q15}			@ put aside IV
+	beq	.Lcbc_dec_two
+	vld1.8	{q2}, [r0]!
+	cmp	r2, #4
+	blo	.Lcbc_dec_three
+	vld1.8	{q3}, [r0]!
+	beq	.Lcbc_dec_four
+	vld1.8	{q4}, [r0]!
+	cmp	r2, #6
+	blo	.Lcbc_dec_five
+	vld1.8	{q5}, [r0]!
+	beq	.Lcbc_dec_six
+	vld1.8	{q6}, [r0]!
+	sub	r0, r0, #0x70
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10-q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12-q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_six:
+	sub	r0, r0, #0x60
+	bl	_bsaes_decrypt8
+	vldmia	r9,{q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10-q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_five:
+	sub	r0, r0, #0x50
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10-q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	veor	q2, q2, q11
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_four:
+	sub	r0, r0, #0x40
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_three:
+	sub	r0, r0, #0x30
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8-q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_two:
+	sub	r0, r0, #0x20
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8}, [r0]!		@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!		@ reload input
+	veor	q1, q1, q8
+	vst1.8	{q0-q1}, [r1]!	@ write output
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_one:
+	sub	r0, r0, #0x10
+	mov	r10, r1			@ save original out pointer
+	mov	r1, r9			@ use the iv scratch space as out buffer
+	mov	r2, r3
+	vmov	q4,q15		@ just in case ensure that IV
+	vmov	q5,q0			@ and input are preserved
+	bl	AES_decrypt
+	vld1.8	{q0}, [r9,:64]		@ load result
+	veor	q0, q0, q4	@ ^= IV
+	vmov	q15, q5		@ q5 holds input
+	vst1.8	{q0}, [r10]		@ write output
+
+.Lcbc_dec_done:
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+.Lcbc_dec_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r9
+	bne		.Lcbc_dec_bzero
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
+	vst1.8	{q15}, [r8]		@ return IV
+	VFP_ABI_POP
+	ldmia	sp!, {r4-r10, pc}
+.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+.extern	AES_encrypt
+.global	bsaes_ctr32_encrypt_blocks
+.type	bsaes_ctr32_encrypt_blocks,%function
+.align	5
+bsaes_ctr32_encrypt_blocks:
+	cmp	r2, #8			@ use plain AES for
+	blo	.Lctr_enc_short			@ small sizes
+
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ ctr is 1st arg on the stack
+	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+	vld1.8	{q0}, [r8]		@ load counter
+	add	r8, r6, #.LREVM0SR-.LM0	@ borrow r8
+	vldmia	sp, {q4}		@ load round0 key
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+.align	2
+0:	add	r12, r3, #248
+	vld1.8	{q0}, [r8]		@ load counter
+	adrl	r8, .LREVM0SR			@ borrow r8
+	vldmia	r12, {q4}			@ load round0 key
+	sub	sp, #0x10			@ place for adjusted round0 key
+#endif
+
+	vmov.i32	q8,#1		@ compose 1<<96
+	veor		q9,q9,q9
+	vrev32.8	q0,q0
+	vext.8		q8,q9,q8,#4
+	vrev32.8	q4,q4
+	vadd.u32	q9,q8,q8	@ compose 2<<96
+	vstmia	sp, {q4}		@ save adjusted round0 key
+	b	.Lctr_enc_loop
+
+.align	4
+.Lctr_enc_loop:
+	vadd.u32	q10, q8, q9	@ compose 3<<96
+	vadd.u32	q1, q0, q8	@ +1
+	vadd.u32	q2, q0, q9	@ +2
+	vadd.u32	q3, q0, q10	@ +3
+	vadd.u32	q4, q1, q10
+	vadd.u32	q5, q2, q10
+	vadd.u32	q6, q3, q10
+	vadd.u32	q7, q4, q10
+	vadd.u32	q10, q5, q10	@ next counter
+
+	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	@ to flip byte order in 32-bit counter
+
+	vldmia		sp, {q9}		@ load round0 key
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x10		@ pass next round key
+#else
+	add		r4, r3, #264
+#endif
+	vldmia		r8, {q8}			@ .LREVM0SR
+	mov		r5, r10			@ pass rounds
+	vstmia		r9, {q10}			@ save next counter
+	sub		r6, r8, #.LREVM0SR-.LSR	@ pass constants
+
+	bl		_bsaes_encrypt8_alt
+
+	subs		r2, r2, #8
+	blo		.Lctr_enc_loop_done
+
+	vld1.8		{q8-q9}, [r0]!	@ load input
+	vld1.8		{q10-q11}, [r0]!
+	veor		q0, q8
+	veor		q1, q9
+	vld1.8		{q12-q13}, [r0]!
+	veor		q4, q10
+	veor		q6, q11
+	vld1.8		{q14-q15}, [r0]!
+	veor		q3, q12
+	vst1.8		{q0-q1}, [r1]!	@ write output
+	veor		q7, q13
+	veor		q2, q14
+	vst1.8		{q4}, [r1]!
+	veor		q5, q15
+	vst1.8		{q6}, [r1]!
+	vmov.i32	q8, #1			@ compose 1<<96
+	vst1.8		{q3}, [r1]!
+	veor		q9, q9, q9
+	vst1.8		{q7}, [r1]!
+	vext.8		q8, q9, q8, #4
+	vst1.8		{q2}, [r1]!
+	vadd.u32	q9,q8,q8		@ compose 2<<96
+	vst1.8		{q5}, [r1]!
+	vldmia		r9, {q0}			@ load counter
+
+	bne		.Lctr_enc_loop
+	b		.Lctr_enc_done
+
+.align	4
+.Lctr_enc_loop_done:
+	add		r2, r2, #8
+	vld1.8		{q8}, [r0]!	@ load input
+	veor		q0, q8
+	vst1.8		{q0}, [r1]!	@ write output
+	cmp		r2, #2
+	blo		.Lctr_enc_done
+	vld1.8		{q9}, [r0]!
+	veor		q1, q9
+	vst1.8		{q1}, [r1]!
+	beq		.Lctr_enc_done
+	vld1.8		{q10}, [r0]!
+	veor		q4, q10
+	vst1.8		{q4}, [r1]!
+	cmp		r2, #4
+	blo		.Lctr_enc_done
+	vld1.8		{q11}, [r0]!
+	veor		q6, q11
+	vst1.8		{q6}, [r1]!
+	beq		.Lctr_enc_done
+	vld1.8		{q12}, [r0]!
+	veor		q3, q12
+	vst1.8		{q3}, [r1]!
+	cmp		r2, #6
+	blo		.Lctr_enc_done
+	vld1.8		{q13}, [r0]!
+	veor		q7, q13
+	vst1.8		{q7}, [r1]!
+	beq		.Lctr_enc_done
+	vld1.8		{q14}, [r0]
+	veor		q2, q14
+	vst1.8		{q2}, [r1]!
+
+.Lctr_enc_done:
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifndef	BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:			@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r9
+	bne		.Lctr_enc_bzero
+#else
+	vstmia		sp, {q0-q1}
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
+	VFP_ABI_POP
+	ldmia	sp!, {r4-r10, pc}	@ return
+
+.align	4
+.Lctr_enc_short:
+	ldr	ip, [sp]		@ ctr pointer is passed on stack
+	stmdb	sp!, {r4-r8, lr}
+
+	mov	r4, r0		@ copy arguments
+	mov	r5, r1
+	mov	r6, r2
+	mov	r7, r3
+	ldr	r8, [ip, #12]		@ load counter LSW
+	vld1.8	{q1}, [ip]		@ load whole counter value
+#ifdef __ARMEL__
+	rev	r8, r8
+#endif
+	sub	sp, sp, #0x10
+	vst1.8	{q1}, [sp,:64]	@ copy counter value
+	sub	sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+	add	r0, sp, #0x10		@ input counter value
+	mov	r1, sp			@ output on the stack
+	mov	r2, r7			@ key
+
+	bl	AES_encrypt
+
+	vld1.8	{q0}, [r4]!	@ load input
+	vld1.8	{q1}, [sp,:64]	@ load encrypted counter
+	add	r8, r8, #1
+#ifdef __ARMEL__
+	rev	r0, r8
+	str	r0, [sp, #0x1c]		@ next counter value
+#else
+	str	r8, [sp, #0x1c]		@ next counter value
+#endif
+	veor	q0,q0,q1
+	vst1.8	{q0}, [r5]!	@ store output
+	subs	r6, r6, #1
+	bne	.Lctr_enc_short_loop
+
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+	vstmia		sp!, {q0-q1}
+
+	ldmia	sp!, {r4-r8, pc}
+.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+.globl	bsaes_xts_encrypt
+.type	bsaes_xts_encrypt,%function
+.align	4
+bsaes_xts_encrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future r3
+
+	mov	r7, r0
+	mov	r8, r1
+	mov	r9, r2
+	mov	r10, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	AES_encrypt
+	mov	r0,sp				@ pointer to initial tweak
+#endif
+
+	ldr	r1, [r10, #240]		@ get # of rounds
+	mov	r3, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #96			@ size of bit-sliced key schedule
+	sub	r12, #48			@ place for tweak[9]
+
+	@ populate the key schedule
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7, q7, q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+#else
+	ldr	r12, [r10, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [r10, #244]
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	add	r12, r10, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7, q7, q15	@ fix up last round key
+	vstmia	r12, {q7}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+
+	vld1.8	{q8}, [r0]			@ initial tweak
+	adr	r2, .Lxts_magic
+
+	subs	r9, #0x80
+	blo	.Lxts_enc_short
+	b	.Lxts_enc_loop
+
+.align	4
+.Lxts_enc_loop:
+	vldmia		r2, {q5}	@ load XTS magic
+	vshr.s64	q6, q8, #63
+	mov		r0, sp
+	vand		q6, q6, q5
+	vadd.u64	q9, q8, q8
+	vst1.64		{q8}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q9, #63
+	veor		q9, q9, q6
+	vand		q7, q7, q5
+	vadd.u64	q10, q9, q9
+	vst1.64		{q9}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q10, #63
+	veor		q10, q10, q7
+	vand		q6, q6, q5
+	vld1.8		{q0}, [r7]!
+	vadd.u64	q11, q10, q10
+	vst1.64		{q10}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q11, #63
+	veor		q11, q11, q6
+	vand		q7, q7, q5
+	vld1.8		{q1}, [r7]!
+	veor		q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64		{q11}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q12, #63
+	veor		q12, q12, q7
+	vand		q6, q6, q5
+	vld1.8		{q2}, [r7]!
+	veor		q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64		{q12}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q13, #63
+	veor		q13, q13, q6
+	vand		q7, q7, q5
+	vld1.8		{q3}, [r7]!
+	veor		q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64		{q13}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q14, #63
+	veor		q14, q14, q7
+	vand		q6, q6, q5
+	vld1.8		{q4}, [r7]!
+	veor		q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64		{q14}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q15, #63
+	veor		q15, q15, q6
+	vand		q7, q7, q5
+	vld1.8		{q5}, [r7]!
+	veor		q4, q4, q12
+	vadd.u64	q8, q15, q15
+	vst1.64		{q15}, [r0,:128]!
+	vswp		d15,d14
+	veor		q8, q8, q7
+	vst1.64		{q8}, [r0,:128]		@ next round tweak
+
+	vld1.8		{q6-q7}, [r7]!
+	veor		q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q6, q6, q14
+	mov		r5, r1			@ pass rounds
+	veor		q7, q7, q15
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	vld1.64		{q14-q15}, [r0,:128]!
+	veor		q10, q3, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	veor		q12, q2, q14
+	vst1.8		{q10-q11}, [r8]!
+	veor		q13, q5, q15
+	vst1.8		{q12-q13}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+
+	subs		r9, #0x80
+	bpl		.Lxts_enc_loop
+
+.Lxts_enc_short:
+	adds		r9, #0x70
+	bmi		.Lxts_enc_done
+
+	vldmia		r2, {q5}	@ load XTS magic
+	vshr.s64	q7, q8, #63
+	mov		r0, sp
+	vand		q7, q7, q5
+	vadd.u64	q9, q8, q8
+	vst1.64		{q8}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q9, #63
+	veor		q9, q9, q7
+	vand		q6, q6, q5
+	vadd.u64	q10, q9, q9
+	vst1.64		{q9}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q10, #63
+	veor		q10, q10, q6
+	vand		q7, q7, q5
+	vld1.8		{q0}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_1
+	vadd.u64	q11, q10, q10
+	vst1.64		{q10}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q11, #63
+	veor		q11, q11, q7
+	vand		q6, q6, q5
+	vld1.8		{q1}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_2
+	veor		q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64		{q11}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q12, #63
+	veor		q12, q12, q6
+	vand		q7, q7, q5
+	vld1.8		{q2}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_3
+	veor		q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64		{q12}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q13, #63
+	veor		q13, q13, q7
+	vand		q6, q6, q5
+	vld1.8		{q3}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_4
+	veor		q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64		{q13}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q14, #63
+	veor		q14, q14, q6
+	vand		q7, q7, q5
+	vld1.8		{q4}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_5
+	veor		q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64		{q14}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q15, #63
+	veor		q15, q15, q7
+	vand		q6, q6, q5
+	vld1.8		{q5}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_enc_6
+	veor		q4, q4, q12
+	sub		r9, #0x10
+	vst1.64		{q15}, [r0,:128]		@ next round tweak
+
+	vld1.8		{q6}, [r7]!
+	veor		q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q6, q6, q14
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	vld1.64		{q14}, [r0,:128]!
+	veor		q10, q3, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	veor		q12, q2, q14
+	vst1.8		{q10-q11}, [r8]!
+	vst1.8		{q12}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_6:
+	vst1.64		{q14}, [r0,:128]		@ next round tweak
+
+	veor		q4, q4, q12
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q5, q5, q13
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	veor		q10, q3, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	vst1.8		{q10-q11}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align	5
+.Lxts_magic:
+	.quad	1, 0x87
+
+.align	5
+.Lxts_enc_5:
+	vst1.64		{q13}, [r0,:128]		@ next round tweak
+
+	veor		q3, q3, q11
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q4, q4, q12
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	veor		q10, q3, q12
+	vst1.8		{q8-q9}, [r8]!
+	vst1.8		{q10}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_4:
+	vst1.64		{q12}, [r0,:128]		@ next round tweak
+
+	veor		q2, q2, q10
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q3, q3, q11
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q6, q11
+	vst1.8		{q8-q9}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_3:
+	vst1.64		{q11}, [r0,:128]		@ next round tweak
+
+	veor		q1, q1, q9
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q2, q2, q10
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	veor		q8, q4, q10
+	vst1.8		{q0-q1}, [r8]!
+	vst1.8		{q8}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_2:
+	vst1.64		{q10}, [r0,:128]		@ next round tweak
+
+	veor		q0, q0, q8
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q1, q1, q9
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	vst1.8		{q0-q1}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_1:
+	mov		r0, sp
+	veor		q0, q8
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+	mov		r4, r3				@ preserve fp
+
+	bl		AES_encrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r8]!
+	mov		r3, r4
+
+	vmov		q8, q9		@ next round tweak
+
+.Lxts_enc_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds		r9, #0x10
+	beq		.Lxts_enc_ret
+	sub		r6, r8, #0x10
+
+.Lxts_enc_steal:
+	ldrb		r0, [r7], #1
+	ldrb		r1, [r8, #-0x10]
+	strb		r0, [r8, #-0x10]
+	strb		r1, [r8], #1
+
+	subs		r9, #1
+	bhi		.Lxts_enc_steal
+
+	vld1.8		{q0}, [r6]
+	mov		r0, sp
+	veor		q0, q0, q8
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+	mov		r4, r3			@ preserve fp
+
+	bl		AES_encrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r6]
+	mov		r3, r4
+#endif
+
+.Lxts_enc_ret:
+	bic		r0, r3, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr		r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+.Lxts_enc_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r0
+	bne		.Lxts_enc_bzero
+
+	mov		sp, r3
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8		{q8}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia		sp!, {r4-r10, pc}	@ return
+
+.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl	bsaes_xts_decrypt
+.type	bsaes_xts_decrypt,%function
+.align	4
+bsaes_xts_decrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future r3
+
+	mov	r7, r0
+	mov	r8, r1
+	mov	r9, r2
+	mov	r10, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	AES_encrypt
+	mov	r0, sp				@ pointer to initial tweak
+#endif
+
+	ldr	r1, [r10, #240]		@ get # of rounds
+	mov	r3, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #96			@ size of bit-sliced key schedule
+	sub	r12, #48			@ place for tweak[9]
+
+	@ populate the key schedule
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, sp, #0x90
+	vldmia	r4, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+#else
+	ldr	r12, [r10, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [r10, #244]
+	mov	r4, r10			@ pass key
+	mov	r5, r1			@ pass # of rounds
+	add	r12, r10, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, r10, #248
+	vldmia	r4, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+	vld1.8	{q8}, [r0]			@ initial tweak
+	adr	r2, .Lxts_magic
+
+	tst	r9, #0xf			@ if not multiple of 16
+	it	ne				@ Thumb2 thing, sanity check in ARM
+	subne	r9, #0x10			@ subtract another 16 bytes
+	subs	r9, #0x80
+
+	blo	.Lxts_dec_short
+	b	.Lxts_dec_loop
+
+.align	4
+.Lxts_dec_loop:
+	vldmia		r2, {q5}	@ load XTS magic
+	vshr.s64	q6, q8, #63
+	mov		r0, sp
+	vand		q6, q6, q5
+	vadd.u64	q9, q8, q8
+	vst1.64		{q8}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q9, #63
+	veor		q9, q9, q6
+	vand		q7, q7, q5
+	vadd.u64	q10, q9, q9
+	vst1.64		{q9}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q10, #63
+	veor		q10, q10, q7
+	vand		q6, q6, q5
+	vld1.8		{q0}, [r7]!
+	vadd.u64	q11, q10, q10
+	vst1.64		{q10}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q11, #63
+	veor		q11, q11, q6
+	vand		q7, q7, q5
+	vld1.8		{q1}, [r7]!
+	veor		q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64		{q11}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q12, #63
+	veor		q12, q12, q7
+	vand		q6, q6, q5
+	vld1.8		{q2}, [r7]!
+	veor		q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64		{q12}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q13, #63
+	veor		q13, q13, q6
+	vand		q7, q7, q5
+	vld1.8		{q3}, [r7]!
+	veor		q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64		{q13}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q14, #63
+	veor		q14, q14, q7
+	vand		q6, q6, q5
+	vld1.8		{q4}, [r7]!
+	veor		q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64		{q14}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q15, #63
+	veor		q15, q15, q6
+	vand		q7, q7, q5
+	vld1.8		{q5}, [r7]!
+	veor		q4, q4, q12
+	vadd.u64	q8, q15, q15
+	vst1.64		{q15}, [r0,:128]!
+	vswp		d15,d14
+	veor		q8, q8, q7
+	vst1.64		{q8}, [r0,:128]		@ next round tweak
+
+	vld1.8		{q6-q7}, [r7]!
+	veor		q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q6, q6, q14
+	mov		r5, r1			@ pass rounds
+	veor		q7, q7, q15
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	vld1.64		{q14-q15}, [r0,:128]!
+	veor		q10, q2, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	veor		q12, q3, q14
+	vst1.8		{q10-q11}, [r8]!
+	veor		q13, q5, q15
+	vst1.8		{q12-q13}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+
+	subs		r9, #0x80
+	bpl		.Lxts_dec_loop
+
+.Lxts_dec_short:
+	adds		r9, #0x70
+	bmi		.Lxts_dec_done
+
+	vldmia		r2, {q5}	@ load XTS magic
+	vshr.s64	q7, q8, #63
+	mov		r0, sp
+	vand		q7, q7, q5
+	vadd.u64	q9, q8, q8
+	vst1.64		{q8}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q9, #63
+	veor		q9, q9, q7
+	vand		q6, q6, q5
+	vadd.u64	q10, q9, q9
+	vst1.64		{q9}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q10, #63
+	veor		q10, q10, q6
+	vand		q7, q7, q5
+	vld1.8		{q0}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_1
+	vadd.u64	q11, q10, q10
+	vst1.64		{q10}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q11, #63
+	veor		q11, q11, q7
+	vand		q6, q6, q5
+	vld1.8		{q1}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_2
+	veor		q0, q0, q8
+	vadd.u64	q12, q11, q11
+	vst1.64		{q11}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q12, #63
+	veor		q12, q12, q6
+	vand		q7, q7, q5
+	vld1.8		{q2}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_3
+	veor		q1, q1, q9
+	vadd.u64	q13, q12, q12
+	vst1.64		{q12}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q13, #63
+	veor		q13, q13, q7
+	vand		q6, q6, q5
+	vld1.8		{q3}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_4
+	veor		q2, q2, q10
+	vadd.u64	q14, q13, q13
+	vst1.64		{q13}, [r0,:128]!
+	vswp		d13,d12
+	vshr.s64	q7, q14, #63
+	veor		q14, q14, q6
+	vand		q7, q7, q5
+	vld1.8		{q4}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_5
+	veor		q3, q3, q11
+	vadd.u64	q15, q14, q14
+	vst1.64		{q14}, [r0,:128]!
+	vswp		d15,d14
+	vshr.s64	q6, q15, #63
+	veor		q15, q15, q7
+	vand		q6, q6, q5
+	vld1.8		{q5}, [r7]!
+	subs		r9, #0x10
+	bmi		.Lxts_dec_6
+	veor		q4, q4, q12
+	sub		r9, #0x10
+	vst1.64		{q15}, [r0,:128]		@ next round tweak
+
+	vld1.8		{q6}, [r7]!
+	veor		q5, q5, q13
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q6, q6, q14
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	vld1.64		{q14}, [r0,:128]!
+	veor		q10, q2, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	veor		q12, q3, q14
+	vst1.8		{q10-q11}, [r8]!
+	vst1.8		{q12}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_6:
+	vst1.64		{q14}, [r0,:128]		@ next round tweak
+
+	veor		q4, q4, q12
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q5, q5, q13
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12-q13}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	veor		q10, q2, q12
+	vst1.8		{q8-q9}, [r8]!
+	veor		q11, q7, q13
+	vst1.8		{q10-q11}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_5:
+	vst1.64		{q13}, [r0,:128]		@ next round tweak
+
+	veor		q3, q3, q11
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q4, q4, q12
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	vld1.64		{q12}, [r0,:128]!
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	veor		q10, q2, q12
+	vst1.8		{q8-q9}, [r8]!
+	vst1.8		{q10}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_4:
+	vst1.64		{q12}, [r0,:128]		@ next round tweak
+
+	veor		q2, q2, q10
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q3, q3, q11
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10-q11}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	veor		q9, q4, q11
+	vst1.8		{q8-q9}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_3:
+	vst1.64		{q11}, [r0,:128]		@ next round tweak
+
+	veor		q1, q1, q9
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q2, q2, q10
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	vld1.64		{q10}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	veor		q8, q6, q10
+	vst1.8		{q0-q1}, [r8]!
+	vst1.8		{q8}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_2:
+	vst1.64		{q10}, [r0,:128]		@ next round tweak
+
+	veor		q0, q0, q8
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, r10, #248			@ pass key schedule
+#endif
+	veor		q1, q1, q9
+	mov		r5, r1			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{q8-q9}, [r0,:128]!
+	veor		q0, q0, q8
+	veor		q1, q1, q9
+	vst1.8		{q0-q1}, [r8]!
+
+	vld1.64		{q8}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_1:
+	mov		r0, sp
+	veor		q0, q8
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+	mov		r4, r3				@ preserve fp
+	mov		r5, r2			@ preserve magic
+
+	bl		AES_decrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r8]!
+	mov		r3, r4
+	mov		r2, r5
+
+	vmov		q8, q9		@ next round tweak
+
+.Lxts_dec_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds		r9, #0x10
+	beq		.Lxts_dec_ret
+
+	@ calculate one round of extra tweak for the stolen ciphertext
+	vldmia		r2, {q5}
+	vshr.s64	q6, q8, #63
+	vand		q6, q6, q5
+	vadd.u64	q9, q8, q8
+	vswp		d13,d12
+	veor		q9, q9, q6
+
+	@ perform the final decryption with the last tweak value
+	vld1.8		{q0}, [r7]!
+	mov		r0, sp
+	veor		q0, q0, q9
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+	mov		r4, r3			@ preserve fp
+
+	bl		AES_decrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q9
+	vst1.8		{q0}, [r8]
+
+	mov		r6, r8
+.Lxts_dec_steal:
+	ldrb		r1, [r8]
+	ldrb		r0, [r7], #1
+	strb		r1, [r8, #0x10]
+	strb		r0, [r8], #1
+
+	subs		r9, #1
+	bhi		.Lxts_dec_steal
+
+	vld1.8		{q0}, [r6]
+	mov		r0, sp
+	veor		q0, q8
+	mov		r1, sp
+	vst1.8		{q0}, [sp,:128]
+	mov		r2, r10
+
+	bl		AES_decrypt
+
+	vld1.8		{q0}, [sp,:128]
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r6]
+	mov		r3, r4
+#endif
+
+.Lxts_dec_ret:
+	bic		r0, r3, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr		r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+.Lxts_dec_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r0
+	bne		.Lxts_dec_bzero
+
+	mov		sp, r3
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8		{q8}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia		sp!, {r4-r10, pc}	@ return
+
+.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
+#endif
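For reference, the XTS paths above double the 128-bit tweak once per block: .Lxts_magic holds {1, 0x87}, and each vshr.s64/vand/vadd.u64/vswp/veor group multiplies the tweak by x in GF(2^128), folding the carry out of bit 127 back in as 0x87 (x^128 = x^7 + x^2 + x + 1). A minimal C sketch of that update, with hypothetical limb names and no claim to mirror the register allocation:

#include <stdint.h>

struct xts_tweak { uint64_t lo, hi; };	/* two little-endian 64-bit limbs */

/* Multiply the tweak by x modulo x^128 + x^7 + x^2 + x + 1, as the
 * vshr/vand/vadd/vswp/veor sequence above does for each block. */
static void xts_tweak_double(struct xts_tweak *t)
{
	uint64_t carry_lo = t->lo >> 63;	/* bit 63 carries into the high limb */
	uint64_t carry_hi = t->hi >> 63;	/* bit 127 feeds the 0x87 reduction */

	t->lo = (t->lo << 1) ^ (carry_hi ? 0x87 : 0);
	t->hi = (t->hi << 1) ^ carry_lo;
}

int main(void)
{
	struct xts_tweak t = { 0x8000000000000000ull, 0x8000000000000000ull };

	xts_tweak_double(&t);	/* -> lo == 0x87, hi == 0x1 */
	return (t.lo == 0x87 && t.hi == 0x1) ? 0 : 1;
}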

+ 434 - 0
arch/arm/crypto/aesbs-glue.c

@@ -0,0 +1,434 @@
+/*
+ * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/ablk_helper.h>
+#include <crypto/algapi.h>
+#include <linux/module.h>
+
+#include "aes_glue.h"
+
+#define BIT_SLICED_KEY_MAXSIZE	(128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
+
+struct BS_KEY {
+	struct AES_KEY	rk;
+	int		converted;
+	u8 __aligned(8)	bs[BIT_SLICED_KEY_MAXSIZE];
+} __aligned(8);
+
+asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
+asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
+
+asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
+				  struct BS_KEY *key, u8 iv[]);
+
+asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
+					   struct BS_KEY *key, u8 const iv[]);
+
+asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
+				  struct BS_KEY *key, u8 tweak[]);
+
+asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
+				  struct BS_KEY *key, u8 tweak[]);
+
+struct aesbs_cbc_ctx {
+	struct AES_KEY	enc;
+	struct BS_KEY	dec;
+};
+
+struct aesbs_ctr_ctx {
+	struct BS_KEY	enc;
+};
+
+struct aesbs_xts_ctx {
+	struct BS_KEY	enc;
+	struct BS_KEY	dec;
+	struct AES_KEY	twkey;
+};
+
+static int aesbs_cbc_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			     unsigned int key_len)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
+	int bits = key_len * 8;
+
+	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
+		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+	ctx->dec.rk = ctx->enc;
+	private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+	ctx->dec.converted = 0;
+	return 0;
+}
+
+static int aesbs_ctr_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			     unsigned int key_len)
+{
+	struct aesbs_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
+	int bits = key_len * 8;
+
+	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+	ctx->enc.converted = 0;
+	return 0;
+}
+
+static int aesbs_xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			     unsigned int key_len)
+{
+	struct aesbs_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	int bits = key_len * 4;
+
+	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
+		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+	ctx->dec.rk = ctx->enc.rk;
+	private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
+	private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
+	ctx->enc.converted = ctx->dec.converted = 0;
+	return 0;
+}
+
+static int aesbs_cbc_encrypt(struct blkcipher_desc *desc,
+			     struct scatterlist *dst,
+			     struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while (walk.nbytes) {
+		u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+		u8 *src = walk.src.virt.addr;
+
+		if (walk.dst.virt.addr == walk.src.virt.addr) {
+			u8 *iv = walk.iv;
+
+			do {
+				crypto_xor(src, iv, AES_BLOCK_SIZE);
+				AES_encrypt(src, src, &ctx->enc);
+				iv = src;
+				src += AES_BLOCK_SIZE;
+			} while (--blocks);
+			memcpy(walk.iv, iv, AES_BLOCK_SIZE);
+		} else {
+			u8 *dst = walk.dst.virt.addr;
+
+			do {
+				crypto_xor(walk.iv, src, AES_BLOCK_SIZE);
+				AES_encrypt(walk.iv, dst, &ctx->enc);
+				memcpy(walk.iv, dst, AES_BLOCK_SIZE);
+				src += AES_BLOCK_SIZE;
+				dst += AES_BLOCK_SIZE;
+			} while (--blocks);
+		}
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	return err;
+}
+
+static int aesbs_cbc_decrypt(struct blkcipher_desc *desc,
+			     struct scatterlist *dst,
+			     struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+	while ((walk.nbytes / AES_BLOCK_SIZE) >= 8) {
+		kernel_neon_begin();
+		bsaes_cbc_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+				  walk.nbytes, &ctx->dec, walk.iv);
+		kernel_neon_end();
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	while (walk.nbytes) {
+		u32 blocks = walk.nbytes / AES_BLOCK_SIZE;
+		u8 *dst = walk.dst.virt.addr;
+		u8 *src = walk.src.virt.addr;
+		u8 bk[2][AES_BLOCK_SIZE];
+		u8 *iv = walk.iv;
+
+		do {
+			if (walk.dst.virt.addr == walk.src.virt.addr)
+				memcpy(bk[blocks & 1], src, AES_BLOCK_SIZE);
+
+			AES_decrypt(src, dst, &ctx->dec.rk);
+			crypto_xor(dst, iv, AES_BLOCK_SIZE);
+
+			if (walk.dst.virt.addr == walk.src.virt.addr)
+				iv = bk[blocks & 1];
+			else
+				iv = src;
+
+			dst += AES_BLOCK_SIZE;
+			src += AES_BLOCK_SIZE;
+		} while (--blocks);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	return err;
+}
+
+static void inc_be128_ctr(__be32 ctr[], u32 addend)
+{
+	int i;
+
+	for (i = 3; i >= 0; i--, addend = 1) {
+		u32 n = be32_to_cpu(ctr[i]) + addend;
+
+		ctr[i] = cpu_to_be32(n);
+		if (n >= addend)
+			break;
+	}
+}
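The wrap test in inc_be128_ctr() above relies on unsigned arithmetic: after n = ctr[i] + addend, the addition overflowed exactly when n came out smaller than addend, so n >= addend means the carry died and propagation can stop. A hypothetical user-space check of that identity:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t addend = 3;
	uint32_t n = 0xffffffffu + addend;	/* wraps to 2 */

	assert(n < addend);			/* wrapped: carry ripples into the next word */

	n = 10u + addend;
	assert(n >= addend);			/* no wrap: propagation stops */
	return 0;
}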
+
+static int aesbs_ctr_encrypt(struct blkcipher_desc *desc,
+			     struct scatterlist *dst, struct scatterlist *src,
+			     unsigned int nbytes)
+{
+	struct aesbs_ctr_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	u32 blocks;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+	while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
+		u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+		__be32 *ctr = (__be32 *)walk.iv;
+		u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
+
+		/* avoid 32 bit counter overflow in the NEON code */
+		if (unlikely(headroom < blocks)) {
+			blocks = headroom + 1;
+			tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
+		}
+		kernel_neon_begin();
+		bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
+					   walk.dst.virt.addr, blocks,
+					   &ctx->enc, walk.iv);
+		kernel_neon_end();
+		inc_be128_ctr(ctr, blocks);
+
+		nbytes -= blocks * AES_BLOCK_SIZE;
+		if (nbytes && nbytes == tail && nbytes <= AES_BLOCK_SIZE)
+			break;
+
+		err = blkcipher_walk_done(desc, &walk, tail);
+	}
+	if (walk.nbytes) {
+		u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+		u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+		u8 ks[AES_BLOCK_SIZE];
+
+		AES_encrypt(walk.iv, ks, &ctx->enc.rk);
+		if (tdst != tsrc)
+			memcpy(tdst, tsrc, nbytes);
+		crypto_xor(tdst, ks, nbytes);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	return err;
+}
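The clamp above keeps the NEON routine, which only increments the low 32-bit counter word, from wrapping mid-batch: headroom is the number of further increments available before ctr[3] overflows, so at most headroom + 1 blocks are processed (the last one uses counter word 0xffffffff) and inc_be128_ctr() then applies the carry to the full 128-bit counter. A stand-alone sketch of the clamping arithmetic, with a hypothetical helper name:

#include <stdint.h>
#include <stdio.h>

/* How many blocks may be handed to a routine that only bumps the low
 * 32-bit counter word, given that word's current value? */
static uint32_t ctr_batch(uint32_t ctr_lsw, uint32_t blocks)
{
	uint32_t headroom = UINT32_MAX - ctr_lsw;	/* increments left before wrap */

	return (headroom < blocks) ? headroom + 1 : blocks;
}

int main(void)
{
	/* with the low word at 0xfffffffe only two more blocks fit */
	printf("%u\n", (unsigned)ctr_batch(0xfffffffeu, 8));	/* prints 2 */
	return 0;
}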
+
+static int aesbs_xts_encrypt(struct blkcipher_desc *desc,
+			     struct scatterlist *dst,
+			     struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+	/* generate the initial tweak */
+	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+	while (walk.nbytes) {
+		kernel_neon_begin();
+		bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
+				  walk.nbytes, &ctx->enc, walk.iv);
+		kernel_neon_end();
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	return err;
+}
+
+static int aesbs_xts_decrypt(struct blkcipher_desc *desc,
+			     struct scatterlist *dst,
+			     struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesbs_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, 8 * AES_BLOCK_SIZE);
+
+	/* generate the initial tweak */
+	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
+
+	while (walk.nbytes) {
+		kernel_neon_begin();
+		bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
+				  walk.nbytes, &ctx->dec, walk.iv);
+		kernel_neon_end();
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	return err;
+}
+
+static struct crypto_alg aesbs_algs[] = { {
+	.cra_name		= "__cbc-aes-neonbs",
+	.cra_driver_name	= "__driver-cbc-aes-neonbs",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct aesbs_cbc_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= aesbs_cbc_set_key,
+		.encrypt	= aesbs_cbc_encrypt,
+		.decrypt	= aesbs_cbc_decrypt,
+	},
+}, {
+	.cra_name		= "__ctr-aes-neonbs",
+	.cra_driver_name	= "__driver-ctr-aes-neonbs",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct aesbs_ctr_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= aesbs_ctr_set_key,
+		.encrypt	= aesbs_ctr_encrypt,
+		.decrypt	= aesbs_ctr_encrypt,
+	},
+}, {
+	.cra_name		= "__xts-aes-neonbs",
+	.cra_driver_name	= "__driver-xts-aes-neonbs",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct aesbs_xts_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= aesbs_xts_set_key,
+		.encrypt	= aesbs_xts_encrypt,
+		.decrypt	= aesbs_xts_decrypt,
+	},
+}, {
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "cbc-aes-neonbs",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= __ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+}, {
+	.cra_name		= "ctr(aes)",
+	.cra_driver_name	= "ctr-aes-neonbs",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+}, {
+	.cra_name		= "xts(aes)",
+	.cra_driver_name	= "xts-aes-neonbs",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+} };
+
+static int __init aesbs_mod_init(void)
+{
+	if (!cpu_has_neon())
+		return -ENODEV;
+
+	return crypto_register_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+static void __exit aesbs_mod_exit(void)
+{
+	crypto_unregister_algs(aesbs_algs, ARRAY_SIZE(aesbs_algs));
+}
+
+module_init(aesbs_mod_init);
+module_exit(aesbs_mod_exit);
+
+MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL");
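The array registers each mode twice: the "__"-prefixed entries are the synchronous blkcipher implementations at priority 0 that do the actual NEON work, while the priority-300 ablkcipher entries under the canonical cbc(aes)/ctr(aes)/xts(aes) names wrap them through the ablk_* helpers, so a plain algorithm lookup prefers the NEON driver over aes-generic and requests from contexts where NEON cannot be used appear to be deferred via cryptd. A hedged sketch (not part of the patch) of how a crypto API user of this era would end up on the NEON driver:

	#include <linux/crypto.h>
	#include <linux/err.h>
	#include <crypto/aes.h>

	/* Sketch only: allocate xts(aes); with this module loaded, the
	 * priority-300 "xts-aes-neonbs" implementation is selected. */
	static int try_neonbs_xts(const u8 *key, unsigned int keylen)
	{
		struct crypto_ablkcipher *tfm;
		int err;

		tfm = crypto_alloc_ablkcipher("xts(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_ablkcipher_setkey(tfm, key, keylen);  /* 2 * AES key size */
		crypto_free_ablkcipher(tfm);
		return err;
	}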

+ 2467 - 0
arch/arm/crypto/bsaes-armv7.pl

@@ -0,0 +1,2467 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+# granted.
+# ====================================================================
+
+# Bit-sliced AES for ARM NEON
+#
+# February 2012.
+#
+# This implementation is a direct adaptation of the bsaes-x86_64 module
+# for ARM NEON. Except that this module is endian-neutral [in the sense
+# that it can be compiled for either endianness] by courtesy of vld1.8's
+# neutrality. The initial version doesn't implement an interface to OpenSSL,
+# only low-level primitives and unsupported entry points, just enough
+# to collect performance results, which for Cortex-A8 core are:
+#
+# encrypt	19.5 cycles per byte processed with 128-bit key
+# decrypt	22.1 cycles per byte processed with 128-bit key
+# key conv.	440  cycles per 128-bit key/0.18 of 8x block
+#
+# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
+# which is [much] worse than anticipated (for further details see
+# http://www.openssl.org/~appro/Snapdragon-S4.html).
+#
+# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+# manages in 20.0 cycles].
+#
+# When comparing to x86_64 results keep in mind that NEON unit is
+# [mostly] single-issue and thus can't [fully] benefit from
+# instruction-level parallelism. And when comparing to aes-armv4
+# results keep in mind key schedule conversion overhead (see
+# bsaes-x86_64.pl for further details)...
+#
+#						<appro@openssl.org>
+
+# April-August 2013
+#
+# Add CBC, CTR and XTS subroutines, adapt for kernel use.
+#
+#					<ard.biesheuvel@linaro.org>
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+my @XMM=map("q$_",(0..15));
+
+{
+my ($key,$rounds,$const)=("r4","r5","r6");
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub Sbox {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+	&InBasisChange	(@b);
+	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
+	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
+my @b=@_[0..7];
+$code.=<<___;
+	veor	@b[2], @b[2], @b[1]
+	veor	@b[5], @b[5], @b[6]
+	veor	@b[3], @b[3], @b[0]
+	veor	@b[6], @b[6], @b[2]
+	veor	@b[5], @b[5], @b[0]
+
+	veor	@b[6], @b[6], @b[3]
+	veor	@b[3], @b[3], @b[7]
+	veor	@b[7], @b[7], @b[5]
+	veor	@b[3], @b[3], @b[4]
+	veor	@b[4], @b[4], @b[5]
+
+	veor	@b[2], @b[2], @b[7]
+	veor	@b[3], @b[3], @b[1]
+	veor	@b[1], @b[1], @b[5]
+___
+}
+
+sub OutBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+	veor	@b[0], @b[0], @b[6]
+	veor	@b[1], @b[1], @b[4]
+	veor	@b[4], @b[4], @b[6]
+	veor	@b[2], @b[2], @b[0]
+	veor	@b[6], @b[6], @b[1]
+
+	veor	@b[1], @b[1], @b[5]
+	veor	@b[5], @b[5], @b[3]
+	veor	@b[3], @b[3], @b[7]
+	veor	@b[7], @b[7], @b[5]
+	veor	@b[2], @b[2], @b[5]
+
+	veor	@b[4], @b[4], @b[7]
+___
+}
+
+sub InvSbox {
+# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+	&InvInBasisChange	(@b);
+	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
+	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange {		# OutBasisChange in reverse (with twist)
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+	 veor	@b[1], @b[1], @b[7]
+	veor	@b[4], @b[4], @b[7]
+
+	veor	@b[7], @b[7], @b[5]
+	 veor	@b[1], @b[1], @b[3]
+	veor	@b[2], @b[2], @b[5]
+	veor	@b[3], @b[3], @b[7]
+
+	veor	@b[6], @b[6], @b[1]
+	veor	@b[2], @b[2], @b[0]
+	 veor	@b[5], @b[5], @b[3]
+	veor	@b[4], @b[4], @b[6]
+	veor	@b[0], @b[0], @b[6]
+	veor	@b[1], @b[1], @b[4]
+___
+}
+
+sub InvOutBasisChange {		# InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+	veor	@b[1], @b[1], @b[5]
+	veor	@b[2], @b[2], @b[7]
+
+	veor	@b[3], @b[3], @b[1]
+	veor	@b[4], @b[4], @b[5]
+	veor	@b[7], @b[7], @b[5]
+	veor	@b[3], @b[3], @b[4]
+	 veor 	@b[5], @b[5], @b[0]
+	veor	@b[3], @b[3], @b[7]
+	 veor	@b[6], @b[6], @b[2]
+	 veor	@b[2], @b[2], @b[1]
+	veor	@b[6], @b[6], @b[3]
+
+	veor	@b[3], @b[3], @b[0]
+	veor	@b[5], @b[5], @b[6]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
+$code.=<<___;
+	veor 	$t0, $y0, $y1
+	vand	$t0, $t0, $x0
+	veor	$x0, $x0, $x1
+	vand	$t1, $x1, $y0
+	vand	$x0, $x0, $y1
+	veor	$x1, $t1, $t0
+	veor	$x0, $x0, $t1
+___
+}
+
+sub Mul_GF4_N {				# not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+	veor	$t0, $y0, $y1
+	vand	$t0, $t0, $x0
+	veor	$x0, $x0, $x1
+	vand	$x1, $x1, $y0
+	vand	$x0, $x0, $y1
+	veor	$x1, $x1, $x0
+	veor	$x0, $x0, $t0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+    $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+	veor	$t0, $y0, $y1
+	 veor 	$t1, $y2, $y3
+	vand	$t0, $t0, $x0
+	 vand	$t1, $t1, $x2
+	veor	$x0, $x0, $x1
+	 veor	$x2, $x2, $x3
+	vand	$x1, $x1, $y0
+	 vand	$x3, $x3, $y2
+	vand	$x0, $x0, $y1
+	 vand	$x2, $x2, $y3
+	veor	$x1, $x1, $x0
+	 veor	$x2, $x2, $x3
+	veor	$x0, $x0, $t0
+	 veor	$x3, $x3, $t1
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+	veor	@t[0], @x[0], @x[2]
+	veor	@t[1], @x[1], @x[3]
+___
+	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+	veor	@y[0], @y[0], @y[2]
+	veor	@y[1], @y[1], @y[3]
+___
+	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
+			 @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+	veor	@x[0], @x[0], @t[0]
+	veor	@x[2], @x[2], @t[0]
+	veor	@x[1], @x[1], @t[1]
+	veor	@x[3], @x[3], @t[1]
+
+	veor	@t[0], @x[4], @x[6]
+	veor	@t[1], @x[5], @x[7]
+___
+	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
+			 @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+	veor	@y[0], @y[0], @y[2]
+	veor	@y[1], @y[1], @y[3]
+___
+	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[2..3]);
+$code.=<<___;
+	veor	@x[4], @x[4], @t[0]
+	veor	@x[6], @x[6], @t[0]
+	veor	@x[5], @x[5], @t[1]
+	veor	@x[7], @x[7], @t[1]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+	veor	@t[3], @x[4], @x[6]
+	veor	@t[2], @x[5], @x[7]
+	veor	@t[1], @x[1], @x[3]
+	veor	@s[1], @x[7], @x[6]
+	 vmov	@t[0], @t[2]
+	veor	@s[0], @x[0], @x[2]
+
+	vorr	@t[2], @t[2], @t[1]
+	veor	@s[3], @t[3], @t[0]
+	vand	@s[2], @t[3], @s[0]
+	vorr	@t[3], @t[3], @s[0]
+	veor	@s[0], @s[0], @t[1]
+	vand	@t[0], @t[0], @t[1]
+	veor	@t[1], @x[3], @x[2]
+	vand	@s[3], @s[3], @s[0]
+	vand	@s[1], @s[1], @t[1]
+	veor	@t[1], @x[4], @x[5]
+	veor	@s[0], @x[1], @x[0]
+	veor	@t[3], @t[3], @s[1]
+	veor	@t[2], @t[2], @s[1]
+	vand	@s[1], @t[1], @s[0]
+	vorr	@t[1], @t[1], @s[0]
+	veor	@t[3], @t[3], @s[3]
+	veor	@t[0], @t[0], @s[1]
+	veor	@t[2], @t[2], @s[2]
+	veor	@t[1], @t[1], @s[3]
+	veor	@t[0], @t[0], @s[2]
+	vand	@s[0], @x[7], @x[3]
+	veor	@t[1], @t[1], @s[2]
+	vand	@s[1], @x[6], @x[2]
+	vand	@s[2], @x[5], @x[1]
+	vorr	@s[3], @x[4], @x[0]
+	veor	@t[3], @t[3], @s[0]
+	veor	@t[1], @t[1], @s[2]
+	veor	@t[0], @t[0], @s[3]
+	veor	@t[2], @t[2], @s[1]
+
+	@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+	@ new smaller inversion
+
+	vand	@s[2], @t[3], @t[1]
+	vmov	@s[0], @t[0]
+
+	veor	@s[1], @t[2], @s[2]
+	veor	@s[3], @t[0], @s[2]
+	veor	@s[2], @t[0], @s[2]	@ @s[2]=@s[3]
+
+	vbsl	@s[1], @t[1], @t[0]
+	vbsl	@s[3], @t[3], @t[2]
+	veor	@t[3], @t[3], @t[2]
+
+	vbsl	@s[0], @s[1], @s[2]
+	vbsl	@t[0], @s[2], @s[1]
+
+	vand	@s[2], @s[0], @s[3]
+	veor	@t[1], @t[1], @t[0]
+
+	veor	@s[2], @s[2], @t[3]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my @t=@_[8..11];
+my $mask=pop;
+$code.=<<___;
+	vldmia	$key!, {@t[0]-@t[3]}
+	veor	@t[0], @t[0], @x[0]
+	veor	@t[1], @t[1], @x[1]
+	vtbl.8	`&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
+	vldmia	$key!, {@t[0]}
+	veor	@t[2], @t[2], @x[2]
+	vtbl.8	`&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
+	vldmia	$key!, {@t[1]}
+	veor	@t[3], @t[3], @x[3]
+	vtbl.8	`&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
+	vldmia	$key!, {@t[2]}
+	vtbl.8	`&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
+	vldmia	$key!, {@t[3]}
+	veor	@t[0], @t[0], @x[4]
+	veor	@t[1], @t[1], @x[5]
+	vtbl.8	`&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
+	veor	@t[2], @t[2], @x[6]
+	vtbl.8	`&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
+	veor	@t[3], @t[3], @x[7]
+	vtbl.8	`&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
+	vtbl.8	`&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
+	vtbl.8	`&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+my $inv=@_[16];	# optional
+$code.=<<___;
+	vext.8	@t[0], @x[0], @x[0], #12	@ x0 <<< 32
+	vext.8	@t[1], @x[1], @x[1], #12
+	 veor	@x[0], @x[0], @t[0]		@ x0 ^ (x0 <<< 32)
+	vext.8	@t[2], @x[2], @x[2], #12
+	 veor	@x[1], @x[1], @t[1]
+	vext.8	@t[3], @x[3], @x[3], #12
+	 veor	@x[2], @x[2], @t[2]
+	vext.8	@t[4], @x[4], @x[4], #12
+	 veor	@x[3], @x[3], @t[3]
+	vext.8	@t[5], @x[5], @x[5], #12
+	 veor	@x[4], @x[4], @t[4]
+	vext.8	@t[6], @x[6], @x[6], #12
+	 veor	@x[5], @x[5], @t[5]
+	vext.8	@t[7], @x[7], @x[7], #12
+	 veor	@x[6], @x[6], @t[6]
+
+	veor	@t[1], @t[1], @x[0]
+	 veor	@x[7], @x[7], @t[7]
+	 vext.8	@x[0], @x[0], @x[0], #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
+	veor	@t[2], @t[2], @x[1]
+	veor	@t[0], @t[0], @x[7]
+	veor	@t[1], @t[1], @x[7]
+	 vext.8	@x[1], @x[1], @x[1], #8
+	veor	@t[5], @t[5], @x[4]
+	 veor	@x[0], @x[0], @t[0]
+	veor	@t[6], @t[6], @x[5]
+	 veor	@x[1], @x[1], @t[1]
+	 vext.8	@t[0], @x[4], @x[4], #8
+	veor	@t[4], @t[4], @x[3]
+	 vext.8	@t[1], @x[5], @x[5], #8
+	veor	@t[7], @t[7], @x[6]
+	 vext.8	@x[4], @x[3], @x[3], #8
+	veor	@t[3], @t[3], @x[2]
+	 vext.8	@x[5], @x[7], @x[7], #8
+	veor	@t[4], @t[4], @x[7]
+	 vext.8	@x[3], @x[6], @x[6], #8
+	veor	@t[3], @t[3], @x[7]
+	 vext.8	@x[6], @x[2], @x[2], #8
+	veor	@x[7], @t[1], @t[5]
+___
+$code.=<<___ if (!$inv);
+	veor	@x[2], @t[0], @t[4]
+	veor	@x[4], @x[4], @t[3]
+	veor	@x[5], @x[5], @t[7]
+	veor	@x[3], @x[3], @t[6]
+	 @ vmov	@x[2], @t[0]
+	veor	@x[6], @x[6], @t[2]
+	 @ vmov	@x[7], @t[1]
+___
+$code.=<<___ if ($inv);
+	veor	@t[3], @t[3], @x[4]
+	veor	@x[5], @x[5], @t[7]
+	veor	@x[2], @x[3], @t[6]
+	veor	@x[3], @t[0], @t[4]
+	veor	@x[4], @x[6], @t[2]
+	vmov	@x[6], @t[3]
+	 @ vmov	@x[7], @t[1]
+___
+}
+
+sub InvMixColumns_orig {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+	@ multiplication by 0x0e
+	vext.8	@t[7], @x[7], @x[7], #12
+	vmov	@t[2], @x[2]
+	veor	@x[2], @x[2], @x[5]		@ 2 5
+	veor	@x[7], @x[7], @x[5]		@ 7 5
+	vext.8	@t[0], @x[0], @x[0], #12
+	vmov	@t[5], @x[5]
+	veor	@x[5], @x[5], @x[0]		@ 5 0		[1]
+	veor	@x[0], @x[0], @x[1]		@ 0 1
+	vext.8	@t[1], @x[1], @x[1], #12
+	veor	@x[1], @x[1], @x[2]		@ 1 25
+	veor	@x[0], @x[0], @x[6]		@ 01 6		[2]
+	vext.8	@t[3], @x[3], @x[3], #12
+	veor	@x[1], @x[1], @x[3]		@ 125 3		[4]
+	veor	@x[2], @x[2], @x[0]		@ 25 016	[3]
+	veor	@x[3], @x[3], @x[7]		@ 3 75
+	veor	@x[7], @x[7], @x[6]		@ 75 6		[0]
+	vext.8	@t[6], @x[6], @x[6], #12
+	vmov	@t[4], @x[4]
+	veor	@x[6], @x[6], @x[4]		@ 6 4
+	veor	@x[4], @x[4], @x[3]		@ 4 375		[6]
+	veor	@x[3], @x[3], @x[7]		@ 375 756=36
+	veor	@x[6], @x[6], @t[5]		@ 64 5		[7]
+	veor	@x[3], @x[3], @t[2]		@ 36 2
+	vext.8	@t[5], @t[5], @t[5], #12
+	veor	@x[3], @x[3], @t[4]		@ 362 4		[5]
+___
+					my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+	@ multiplication by 0x0b
+	veor	@y[1], @y[1], @y[0]
+	veor	@y[0], @y[0], @t[0]
+	vext.8	@t[2], @t[2], @t[2], #12
+	veor	@y[1], @y[1], @t[1]
+	veor	@y[0], @y[0], @t[5]
+	vext.8	@t[4], @t[4], @t[4], #12
+	veor	@y[1], @y[1], @t[6]
+	veor	@y[0], @y[0], @t[7]
+	veor	@t[7], @t[7], @t[6]		@ clobber t[7]
+
+	veor	@y[3], @y[3], @t[0]
+	 veor	@y[1], @y[1], @y[0]
+	vext.8	@t[0], @t[0], @t[0], #12
+	veor	@y[2], @y[2], @t[1]
+	veor	@y[4], @y[4], @t[1]
+	vext.8	@t[1], @t[1], @t[1], #12
+	veor	@y[2], @y[2], @t[2]
+	veor	@y[3], @y[3], @t[2]
+	veor	@y[5], @y[5], @t[2]
+	veor	@y[2], @y[2], @t[7]
+	vext.8	@t[2], @t[2], @t[2], #12
+	veor	@y[3], @y[3], @t[3]
+	veor	@y[6], @y[6], @t[3]
+	veor	@y[4], @y[4], @t[3]
+	veor	@y[7], @y[7], @t[4]
+	vext.8	@t[3], @t[3], @t[3], #12
+	veor	@y[5], @y[5], @t[4]
+	veor	@y[7], @y[7], @t[7]
+	veor	@t[7], @t[7], @t[5]		@ clobber t[7] even more
+	veor	@y[3], @y[3], @t[5]
+	veor	@y[4], @y[4], @t[4]
+
+	veor	@y[5], @y[5], @t[7]
+	vext.8	@t[4], @t[4], @t[4], #12
+	veor	@y[6], @y[6], @t[7]
+	veor	@y[4], @y[4], @t[7]
+
+	veor	@t[7], @t[7], @t[5]
+	vext.8	@t[5], @t[5], @t[5], #12
+
+	@ multiplication by 0x0d
+	veor	@y[4], @y[4], @y[7]
+	 veor	@t[7], @t[7], @t[6]		@ restore t[7]
+	veor	@y[7], @y[7], @t[4]
+	vext.8	@t[6], @t[6], @t[6], #12
+	veor	@y[2], @y[2], @t[0]
+	veor	@y[7], @y[7], @t[5]
+	vext.8	@t[7], @t[7], @t[7], #12
+	veor	@y[2], @y[2], @t[2]
+
+	veor	@y[3], @y[3], @y[1]
+	veor	@y[1], @y[1], @t[1]
+	veor	@y[0], @y[0], @t[0]
+	veor	@y[3], @y[3], @t[0]
+	veor	@y[1], @y[1], @t[5]
+	veor	@y[0], @y[0], @t[5]
+	vext.8	@t[0], @t[0], @t[0], #12
+	veor	@y[1], @y[1], @t[7]
+	veor	@y[0], @y[0], @t[6]
+	veor	@y[3], @y[3], @y[1]
+	veor	@y[4], @y[4], @t[1]
+	vext.8	@t[1], @t[1], @t[1], #12
+
+	veor	@y[7], @y[7], @t[7]
+	veor	@y[4], @y[4], @t[2]
+	veor	@y[5], @y[5], @t[2]
+	veor	@y[2], @y[2], @t[6]
+	veor	@t[6], @t[6], @t[3]		@ clobber t[6]
+	vext.8	@t[2], @t[2], @t[2], #12
+	veor	@y[4], @y[4], @y[7]
+	veor	@y[3], @y[3], @t[6]
+
+	veor	@y[6], @y[6], @t[6]
+	veor	@y[5], @y[5], @t[5]
+	vext.8	@t[5], @t[5], @t[5], #12
+	veor	@y[6], @y[6], @t[4]
+	vext.8	@t[4], @t[4], @t[4], #12
+	veor	@y[5], @y[5], @t[6]
+	veor	@y[6], @y[6], @t[7]
+	vext.8	@t[7], @t[7], @t[7], #12
+	veor	@t[6], @t[6], @t[3]		@ restore t[6]
+	vext.8	@t[3], @t[3], @t[3], #12
+
+	@ multiplication by 0x09
+	veor	@y[4], @y[4], @y[1]
+	veor	@t[1], @t[1], @y[1]		@ t[1]=y[1]
+	veor	@t[0], @t[0], @t[5]		@ clobber t[0]
+	vext.8	@t[6], @t[6], @t[6], #12
+	veor	@t[1], @t[1], @t[5]
+	veor	@y[3], @y[3], @t[0]
+	veor	@t[0], @t[0], @y[0]		@ t[0]=y[0]
+	veor	@t[1], @t[1], @t[6]
+	veor	@t[6], @t[6], @t[7]		@ clobber t[6]
+	veor	@y[4], @y[4], @t[1]
+	veor	@y[7], @y[7], @t[4]
+	veor	@y[6], @y[6], @t[3]
+	veor	@y[5], @y[5], @t[2]
+	veor	@t[4], @t[4], @y[4]		@ t[4]=y[4]
+	veor	@t[3], @t[3], @y[3]		@ t[3]=y[3]
+	veor	@t[5], @t[5], @y[5]		@ t[5]=y[5]
+	veor	@t[2], @t[2], @y[2]		@ t[2]=y[2]
+	veor	@t[3], @t[3], @t[7]
+	veor	@XMM[5], @t[5], @t[6]
+	veor	@XMM[6], @t[6], @y[6]		@ t[6]=y[6]
+	veor	@XMM[2], @t[2], @t[6]
+	veor	@XMM[7], @t[7], @y[7]		@ t[7]=y[7]
+
+	vmov	@XMM[0], @t[0]
+	vmov	@XMM[1], @t[1]
+	@ vmov	@XMM[2], @t[2]
+	vmov	@XMM[3], @t[3]
+	vmov	@XMM[4], @t[4]
+	@ vmov	@XMM[5], @t[5]
+	@ vmov	@XMM[6], @t[6]
+	@ vmov	@XMM[7], @t[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+# Thanks to Jussi Kivilinna for providing pointer to
+#
+# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
+# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
+# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
+# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
+
+$code.=<<___;
+	@ multiplication by 0x05-0x00-0x04-0x00
+	vext.8	@t[0], @x[0], @x[0], #8
+	vext.8	@t[6], @x[6], @x[6], #8
+	vext.8	@t[7], @x[7], @x[7], #8
+	veor	@t[0], @t[0], @x[0]
+	vext.8	@t[1], @x[1], @x[1], #8
+	veor	@t[6], @t[6], @x[6]
+	vext.8	@t[2], @x[2], @x[2], #8
+	veor	@t[7], @t[7], @x[7]
+	vext.8	@t[3], @x[3], @x[3], #8
+	veor	@t[1], @t[1], @x[1]
+	vext.8	@t[4], @x[4], @x[4], #8
+	veor	@t[2], @t[2], @x[2]
+	vext.8	@t[5], @x[5], @x[5], #8
+	veor	@t[3], @t[3], @x[3]
+	veor	@t[4], @t[4], @x[4]
+	veor	@t[5], @t[5], @x[5]
+
+	 veor	@x[0], @x[0], @t[6]
+	 veor	@x[1], @x[1], @t[6]
+	 veor	@x[2], @x[2], @t[0]
+	 veor	@x[4], @x[4], @t[2]
+	 veor	@x[3], @x[3], @t[1]
+	 veor	@x[1], @x[1], @t[7]
+	 veor	@x[2], @x[2], @t[7]
+	 veor	@x[4], @x[4], @t[6]
+	 veor	@x[5], @x[5], @t[3]
+	 veor	@x[3], @x[3], @t[6]
+	 veor	@x[6], @x[6], @t[4]
+	 veor	@x[4], @x[4], @t[7]
+	 veor	@x[5], @x[5], @t[7]
+	 veor	@x[7], @x[7], @t[5]
+___
+	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
+}
+
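The factorization credited above holds because the MixColumns matrix M = circ(02,03,01,01) has order 4 over GF(2^8): M^2 is exactly the sparse circ(05,00,04,00), so M^-1 = M * M^2, and the cheap "multiplication by 0x05-0x00-0x04-0x00" prologue followed by a reuse of MixColumns yields InvMixColumns. A stand-alone check of the identity in ordinary C (not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	/* GF(2^8) multiply, reducing by the AES polynomial x^8+x^4+x^3+x+1 */
	static uint8_t gfmul(uint8_t a, uint8_t b)
	{
		uint8_t p = 0;

		while (b) {
			if (b & 1)
				p ^= a;
			a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
			b >>= 1;
		}
		return p;
	}

	int main(void)
	{
		/* first rows of the circulant matrices quoted in the comment */
		static const uint8_t m[4]   = { 0x02, 0x03, 0x01, 0x01 };
		static const uint8_t d[4]   = { 0x05, 0x00, 0x04, 0x00 };
		static const uint8_t inv[4] = { 0x0e, 0x0b, 0x0d, 0x09 };
		int i, j, k;

		for (i = 0; i < 4; i++) {
			for (j = 0; j < 4; j++) {
				uint8_t s = 0;

				for (k = 0; k < 4; k++)	/* (M * D)[i][j] */
					s ^= gfmul(m[(k - i + 4) & 3], d[(j - k + 4) & 3]);
				if (s != inv[(j - i + 4) & 3]) {
					printf("mismatch at %d,%d\n", i, j);
					return 1;
				}
			}
		}
		printf("InvMixColumns == MixColumns x circ(05,00,04,00)\n");
		return 0;
	}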
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+	vshr.u64	$t, $b, #$n
+	veor		$t, $t, $a
+	vand		$t, $t, $mask
+	veor		$a, $a, $t
+	vshl.u64	$t, $t, #$n
+	veor		$b, $b, $t
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+	vshr.u64	$t0, $b0, #$n
+	 vshr.u64	$t1, $b1, #$n
+	veor		$t0, $t0, $a0
+	 veor		$t1, $t1, $a1
+	vand		$t0, $t0, $mask
+	 vand		$t1, $t1, $mask
+	veor		$a0, $a0, $t0
+	vshl.u64	$t0, $t0, #$n
+	 veor		$a1, $a1, $t1
+	 vshl.u64	$t1, $t1, #$n
+	veor		$b0, $b0, $t0
+	 veor		$b1, $b1, $t1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+	vmov.i8	$t0,#0x55			@ compose .LBS0
+	vmov.i8	$t1,#0x33			@ compose .LBS1
+___
+	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+	vmov.i8	$t0,#0x0f			@ compose .LBS2
+___
+	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
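swapmove/swapmove2x are the usual "delta swap" of bitsliced AES implementations: with the 0x55/0x33/0x0f masks composed in bitslice() above, three passes exchange bits at distances 1, 2 and 4, in effect transposing the 8x8 bit matrix spread across the eight q registers and converting eight AES states between the byte-wise and the bitsliced representation. The same step on 64-bit scalars (a sketch, not taken from the patch):

	#include <stdint.h>

	/* One swapmove step: exchange the bits of *b selected by mask << n
	 * with the bits of *a selected by mask. */
	static inline void swapmove64(uint64_t *a, uint64_t *b, int n, uint64_t mask)
	{
		uint64_t t = ((*b >> n) ^ *a) & mask;

		*a ^= t;
		*b ^= t << n;
	}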
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+# define VFP_ABI_FRAME	0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME	0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_ARCH__>=7
+.text
+.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
+#ifdef __thumb2__
+.thumb
+#else
+.code   32
+#endif
+
+.fpu	neon
+
+.type	_bsaes_decrypt8,%function
+.align	4
+_bsaes_decrypt8:
+	adr	$const,_bsaes_decrypt8
+	vldmia	$key!, {@XMM[9]}		@ round 0 key
+	add	$const,$const,#.LM0ISR-_bsaes_decrypt8
+
+	vldmia	$const!, {@XMM[8]}		@ .LM0ISR
+	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
+	veor	@XMM[11], @XMM[1], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+	veor	@XMM[12], @XMM[2], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+	veor	@XMM[13], @XMM[3], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+	veor	@XMM[14], @XMM[4], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+	veor	@XMM[15], @XMM[5], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+	veor	@XMM[10], @XMM[6], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+	veor	@XMM[11], @XMM[7], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+___
+	&bitslice	(@XMM[0..7, 8..11]);
+$code.=<<___;
+	sub	$rounds,$rounds,#1
+	b	.Ldec_sbox
+.align	4
+.Ldec_loop:
+___
+	&ShiftRows	(@XMM[0..7, 8..12]);
+$code.=".Ldec_sbox:\n";
+	&InvSbox	(@XMM[0..7, 8..15]);
+$code.=<<___;
+	subs	$rounds,$rounds,#1
+	bcc	.Ldec_done
+___
+	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+	vldmia	$const, {@XMM[12]}		@ .LISR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	$const,$const,#0x10
+	bne	.Ldec_loop
+	vldmia	$const, {@XMM[12]}		@ .LISRM0
+	b	.Ldec_loop
+.align	4
+.Ldec_done:
+___
+	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+	vldmia	$key, {@XMM[8]}			@ last round key
+	veor	@XMM[6], @XMM[6], @XMM[8]
+	veor	@XMM[4], @XMM[4], @XMM[8]
+	veor	@XMM[2], @XMM[2], @XMM[8]
+	veor	@XMM[7], @XMM[7], @XMM[8]
+	veor	@XMM[3], @XMM[3], @XMM[8]
+	veor	@XMM[5], @XMM[5], @XMM[8]
+	veor	@XMM[0], @XMM[0], @XMM[8]
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	bx	lr
+.size	_bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type	_bsaes_const,%object
+.align	6
+_bsaes_const:
+.LM0ISR:	@ InvShiftRows constants
+	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:		@ ShiftRows constants
+	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+	.quad	0x090d01050c000408, 0x03070b0f060a0e02
+.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	6
+.size	_bsaes_const,.-_bsaes_const
+
+.type	_bsaes_encrypt8,%function
+.align	4
+_bsaes_encrypt8:
+	adr	$const,_bsaes_encrypt8
+	vldmia	$key!, {@XMM[9]}		@ round 0 key
+	sub	$const,$const,#_bsaes_encrypt8-.LM0SR
+
+	vldmia	$const!, {@XMM[8]}		@ .LM0SR
+_bsaes_encrypt8_alt:
+	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
+	veor	@XMM[11], @XMM[1], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+	veor	@XMM[12], @XMM[2], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+	veor	@XMM[13], @XMM[3], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
+	veor	@XMM[14], @XMM[4], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
+	veor	@XMM[15], @XMM[5], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
+	veor	@XMM[10], @XMM[6], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
+	veor	@XMM[11], @XMM[7], @XMM[9]
+	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
+	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
+	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
+_bsaes_encrypt8_bitslice:
+___
+	&bitslice	(@XMM[0..7, 8..11]);
+$code.=<<___;
+	sub	$rounds,$rounds,#1
+	b	.Lenc_sbox
+.align	4
+.Lenc_loop:
+___
+	&ShiftRows	(@XMM[0..7, 8..12]);
+$code.=".Lenc_sbox:\n";
+	&Sbox		(@XMM[0..7, 8..15]);
+$code.=<<___;
+	subs	$rounds,$rounds,#1
+	bcc	.Lenc_done
+___
+	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+	vldmia	$const, {@XMM[12]}		@ .LSR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	$const,$const,#0x10
+	bne	.Lenc_loop
+	vldmia	$const, {@XMM[12]}		@ .LSRM0
+	b	.Lenc_loop
+.align	4
+.Lenc_done:
+___
+	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+	vldmia	$key, {@XMM[8]}			@ last round key
+	veor	@XMM[4], @XMM[4], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[8]
+	veor	@XMM[3], @XMM[3], @XMM[8]
+	veor	@XMM[7], @XMM[7], @XMM[8]
+	veor	@XMM[2], @XMM[2], @XMM[8]
+	veor	@XMM[5], @XMM[5], @XMM[8]
+	veor	@XMM[0], @XMM[0], @XMM[8]
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	bx	lr
+.size	_bsaes_encrypt8,.-_bsaes_encrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+	@ &swapmove(@x[2,3],1,$t0,$t2,$t3);
+	vmov	@x[2], @x[0]
+	vmov	@x[3], @x[1]
+___
+	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+	@ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+	vmov	@x[4], @x[0]
+	vmov	@x[6], @x[2]
+	vmov	@x[5], @x[1]
+	vmov	@x[7], @x[3]
+___
+	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
+	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type	_bsaes_key_convert,%function
+.align	4
+_bsaes_key_convert:
+	adr	$const,_bsaes_key_convert
+	vld1.8	{@XMM[7]},  [$inp]!		@ load round 0 key
+	sub	$const,$const,#_bsaes_key_convert-.LM0
+	vld1.8	{@XMM[15]}, [$inp]!		@ load round 1 key
+
+	vmov.i8	@XMM[8],  #0x01			@ bit masks
+	vmov.i8	@XMM[9],  #0x02
+	vmov.i8	@XMM[10], #0x04
+	vmov.i8	@XMM[11], #0x08
+	vmov.i8	@XMM[12], #0x10
+	vmov.i8	@XMM[13], #0x20
+	vldmia	$const, {@XMM[14]}		@ .LM0
+
+#ifdef __ARMEL__
+	vrev32.8	@XMM[7],  @XMM[7]
+	vrev32.8	@XMM[15], @XMM[15]
+#endif
+	sub	$rounds,$rounds,#1
+	vstmia	$out!, {@XMM[7]}		@ save round 0 key
+	b	.Lkey_loop
+
+.align	4
+.Lkey_loop:
+	vtbl.8	`&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
+	vtbl.8	`&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
+	vmov.i8	@XMM[6],  #0x40
+	vmov.i8	@XMM[15], #0x80
+
+	vtst.8	@XMM[0], @XMM[7], @XMM[8]
+	vtst.8	@XMM[1], @XMM[7], @XMM[9]
+	vtst.8	@XMM[2], @XMM[7], @XMM[10]
+	vtst.8	@XMM[3], @XMM[7], @XMM[11]
+	vtst.8	@XMM[4], @XMM[7], @XMM[12]
+	vtst.8	@XMM[5], @XMM[7], @XMM[13]
+	vtst.8	@XMM[6], @XMM[7], @XMM[6]
+	vtst.8	@XMM[7], @XMM[7], @XMM[15]
+	vld1.8	{@XMM[15]}, [$inp]!		@ load next round key
+	vmvn	@XMM[0], @XMM[0]		@ "pnot"
+	vmvn	@XMM[1], @XMM[1]
+	vmvn	@XMM[5], @XMM[5]
+	vmvn	@XMM[6], @XMM[6]
+#ifdef __ARMEL__
+	vrev32.8	@XMM[15], @XMM[15]
+#endif
+	subs	$rounds,$rounds,#1
+	vstmia	$out!,{@XMM[0]-@XMM[7]}		@ write bit-sliced round key
+	bne	.Lkey_loop
+
+	vmov.i8	@XMM[7],#0x63			@ compose .L63
+	@ don't save last round key
+	bx	lr
+.size	_bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0) {		# following four functions are unsupported interface
+			# used for benchmarking...
+$code.=<<___;
+.globl	bsaes_enc_key_convert
+.type	bsaes_enc_key_convert,%function
+.align	4
+bsaes_enc_key_convert:
+	stmdb	sp!,{r4-r6,lr}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+
+	ldr	r5,[$inp,#240]			@ pass rounds
+	mov	r4,$inp				@ pass key
+	mov	r12,$out			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}			@ save last round key
+
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r6,pc}
+.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl	bsaes_encrypt_128
+.type	bsaes_encrypt_128,%function
+.align	4
+bsaes_encrypt_128:
+	stmdb	sp!,{r4-r6,lr}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+.Lenc128_loop:
+	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
+	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
+	mov	r4,$key				@ pass the key
+	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
+	mov	r5,#10				@ pass rounds
+	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
+
+	bl	_bsaes_encrypt8
+
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[3]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	subs	$len,$len,#0x80
+	vst1.8	{@XMM[5]}, [$out]!
+	bhi	.Lenc128_loop
+
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r6,pc}
+.size	bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl	bsaes_dec_key_convert
+.type	bsaes_dec_key_convert,%function
+.align	4
+bsaes_dec_key_convert:
+	stmdb	sp!,{r4-r6,lr}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+
+	ldr	r5,[$inp,#240]			@ pass rounds
+	mov	r4,$inp				@ pass key
+	mov	r12,$out			@ pass key schedule
+	bl	_bsaes_key_convert
+	vldmia	$out, {@XMM[6]}
+	vstmia	r12,  {@XMM[15]}		@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	$out, {@XMM[7]}
+
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r6,pc}
+.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl	bsaes_decrypt_128
+.type	bsaes_decrypt_128,%function
+.align	4
+bsaes_decrypt_128:
+	stmdb	sp!,{r4-r6,lr}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+.Ldec128_loop:
+	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
+	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
+	mov	r4,$key				@ pass the key
+	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
+	mov	r5,#10				@ pass rounds
+	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
+
+	bl	_bsaes_decrypt8
+
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	vst1.8	{@XMM[3]}, [$out]!
+	subs	$len,$len,#0x80
+	vst1.8	{@XMM[5]}, [$out]!
+	bhi	.Ldec128_loop
+
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r6,pc}
+.size	bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
+my ($keysched)=("sp");
+
+$code.=<<___;
+.extern AES_cbc_encrypt
+.extern AES_decrypt
+
+.global	bsaes_cbc_encrypt
+.type	bsaes_cbc_encrypt,%function
+.align	5
+bsaes_cbc_encrypt:
+#ifndef	__KERNEL__
+	cmp	$len, #128
+#ifndef	__thumb__
+	blo	AES_cbc_encrypt
+#else
+	bhs	1f
+	b	AES_cbc_encrypt
+1:
+#endif
+#endif
+
+	@ it is up to the caller to make sure we are called with enc == 0
+
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}
+	VFP_ABI_PUSH
+	ldr	$ivp, [ip]			@ IV is 1st arg on the stack
+	mov	$len, $len, lsr#4		@ len in 16 byte blocks
+	sub	sp, #0x10			@ scratch space to carry over the IV
+	mov	$fp, sp				@ save sp
+
+	ldr	$rounds, [$key, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
+	add	r12, #`128-32`			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	mov	sp, r12				@ sp is $keysched
+	bl	_bsaes_key_convert
+	vldmia	$keysched, {@XMM[6]}
+	vstmia	r12,  {@XMM[15]}		@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	$keysched, {@XMM[7]}
+#else
+	ldr	r12, [$key, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [$key, #244]
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	add	r12, $key, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, $key, #248
+	vldmia	r4, {@XMM[6]}
+	vstmia	r12, {@XMM[15]}			@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	r4, {@XMM[7]}
+
+.align	2
+0:
+#endif
+
+	vld1.8	{@XMM[15]}, [$ivp]		@ load IV
+	b	.Lcbc_dec_loop
+
+.align	4
+.Lcbc_dec_loop:
+	subs	$len, $len, #0x8
+	bmi	.Lcbc_dec_loop_finish
+
+	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
+	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, $keysched			@ pass the key
+#else
+	add	r4, $key, #248
+#endif
+	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
+	mov	r5, $rounds
+	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]
+	sub	$inp, $inp, #0x60
+	vstmia	$fp, {@XMM[15]}			@ put aside IV
+
+	bl	_bsaes_decrypt8
+
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	veor	@XMM[2], @XMM[2], @XMM[11]
+	vld1.8	{@XMM[14]-@XMM[15]}, [$inp]!
+	veor	@XMM[7], @XMM[7], @XMM[12]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	veor	@XMM[3], @XMM[3], @XMM[13]
+	vst1.8	{@XMM[6]}, [$out]!
+	veor	@XMM[5], @XMM[5], @XMM[14]
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	vst1.8	{@XMM[3]}, [$out]!
+	vst1.8	{@XMM[5]}, [$out]!
+
+	b	.Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+	adds	$len, $len, #8
+	beq	.Lcbc_dec_done
+
+	vld1.8	{@XMM[0]}, [$inp]!		@ load input
+	cmp	$len, #2
+	blo	.Lcbc_dec_one
+	vld1.8	{@XMM[1]}, [$inp]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, $keysched			@ pass the key
+#else
+	add	r4, $key, #248
+#endif
+	mov	r5, $rounds
+	vstmia	$fp, {@XMM[15]}			@ put aside IV
+	beq	.Lcbc_dec_two
+	vld1.8	{@XMM[2]}, [$inp]!
+	cmp	$len, #4
+	blo	.Lcbc_dec_three
+	vld1.8	{@XMM[3]}, [$inp]!
+	beq	.Lcbc_dec_four
+	vld1.8	{@XMM[4]}, [$inp]!
+	cmp	$len, #6
+	blo	.Lcbc_dec_five
+	vld1.8	{@XMM[5]}, [$inp]!
+	beq	.Lcbc_dec_six
+	vld1.8	{@XMM[6]}, [$inp]!
+	sub	$inp, $inp, #0x70
+
+	bl	_bsaes_decrypt8
+
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	veor	@XMM[2], @XMM[2], @XMM[11]
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[7], @XMM[7], @XMM[12]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	veor	@XMM[3], @XMM[3], @XMM[13]
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	vst1.8	{@XMM[3]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_six:
+	sub	$inp, $inp, #0x60
+	bl	_bsaes_decrypt8
+	vldmia	$fp,{@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[12]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	veor	@XMM[2], @XMM[2], @XMM[11]
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[7], @XMM[7], @XMM[12]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	vst1.8	{@XMM[7]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_five:
+	sub	$inp, $inp, #0x50
+	bl	_bsaes_decrypt8
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	veor	@XMM[2], @XMM[2], @XMM[11]
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	vst1.8	{@XMM[2]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_four:
+	sub	$inp, $inp, #0x40
+	bl	_bsaes_decrypt8
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[10]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[4], @XMM[4], @XMM[10]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[6]}, [$out]!
+	vst1.8	{@XMM[4]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_three:
+	sub	$inp, $inp, #0x30
+	bl	_bsaes_decrypt8
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[15]}, [$inp]!
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	veor	@XMM[6], @XMM[6], @XMM[9]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	vst1.8	{@XMM[6]}, [$out]!
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_two:
+	sub	$inp, $inp, #0x20
+	bl	_bsaes_decrypt8
+	vldmia	$fp, {@XMM[14]}			@ reload IV
+	vld1.8	{@XMM[8]}, [$inp]!		@ reload input
+	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
+	vld1.8	{@XMM[15]}, [$inp]!		@ reload input
+	veor	@XMM[1], @XMM[1], @XMM[8]
+	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	b	.Lcbc_dec_done
+.align	4
+.Lcbc_dec_one:
+	sub	$inp, $inp, #0x10
+	mov	$rounds, $out			@ save original out pointer
+	mov	$out, $fp			@ use the iv scratch space as out buffer
+	mov	r2, $key
+	vmov	@XMM[4],@XMM[15]		@ just in case ensure that IV
+	vmov	@XMM[5],@XMM[0]			@ and input are preserved
+	bl	AES_decrypt
+	vld1.8	{@XMM[0]}, [$fp,:64]		@ load result
+	veor	@XMM[0], @XMM[0], @XMM[4]	@ ^= IV
+	vmov	@XMM[15], @XMM[5]		@ @XMM[5] holds input
+	vst1.8	{@XMM[0]}, [$rounds]		@ write output
+
+.Lcbc_dec_done:
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+.Lcbc_dec_bzero:				@ wipe key schedule [if any]
+	vstmia		$keysched!, {q0-q1}
+	cmp		$keysched, $fp
+	bne		.Lcbc_dec_bzero
+#endif
+
+	mov	sp, $fp
+	add	sp, #0x10			@ add sp,$fp,#0x10 is no good for thumb
+	vst1.8	{@XMM[15]}, [$ivp]		@ return IV
+	VFP_ABI_POP
+	ldmia	sp!, {r4-r10, pc}
+.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+___
+}
+{
+my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
+my $const = "r6";	# shared with _bsaes_encrypt8_alt
+my $keysched = "sp";
+
+$code.=<<___;
+.extern	AES_encrypt
+.global	bsaes_ctr32_encrypt_blocks
+.type	bsaes_ctr32_encrypt_blocks,%function
+.align	5
+bsaes_ctr32_encrypt_blocks:
+	cmp	$len, #8			@ use plain AES for
+	blo	.Lctr_enc_short			@ small sizes
+
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}
+	VFP_ABI_PUSH
+	ldr	$ctr, [ip]			@ ctr is 1st arg on the stack
+	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
+	mov	$fp, sp				@ save sp
+
+	ldr	$rounds, [$key, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
+	add	r12, #`128-32`			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	mov	sp, r12				@ sp is $keysched
+	bl	_bsaes_key_convert
+	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}			@ save last round key
+
+	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
+	add	$ctr, $const, #.LREVM0SR-.LM0	@ borrow $ctr
+	vldmia	$keysched, {@XMM[4]}		@ load round0 key
+#else
+	ldr	r12, [$key, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [$key, #244]
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	add	r12, $key, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}			@ save last round key
+
+.align	2
+0:	add	r12, $key, #248
+	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
+	adrl	$ctr, .LREVM0SR			@ borrow $ctr
+	vldmia	r12, {@XMM[4]}			@ load round0 key
+	sub	sp, #0x10			@ place for adjusted round0 key
+#endif
+
+	vmov.i32	@XMM[8],#1		@ compose 1<<96
+	veor		@XMM[9],@XMM[9],@XMM[9]
+	vrev32.8	@XMM[0],@XMM[0]
+	vext.8		@XMM[8],@XMM[9],@XMM[8],#4
+	vrev32.8	@XMM[4],@XMM[4]
+	vadd.u32	@XMM[9],@XMM[8],@XMM[8]	@ compose 2<<96
+	vstmia	$keysched, {@XMM[4]}		@ save adjusted round0 key
+	b	.Lctr_enc_loop
+
+.align	4
+.Lctr_enc_loop:
+	vadd.u32	@XMM[10], @XMM[8], @XMM[9]	@ compose 3<<96
+	vadd.u32	@XMM[1], @XMM[0], @XMM[8]	@ +1
+	vadd.u32	@XMM[2], @XMM[0], @XMM[9]	@ +2
+	vadd.u32	@XMM[3], @XMM[0], @XMM[10]	@ +3
+	vadd.u32	@XMM[4], @XMM[1], @XMM[10]
+	vadd.u32	@XMM[5], @XMM[2], @XMM[10]
+	vadd.u32	@XMM[6], @XMM[3], @XMM[10]
+	vadd.u32	@XMM[7], @XMM[4], @XMM[10]
+	vadd.u32	@XMM[10], @XMM[5], @XMM[10]	@ next counter
+
+	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	@ to flip byte order in 32-bit counter
+
+	vldmia		$keysched, {@XMM[9]}		@ load round0 key
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, $keysched, #0x10		@ pass next round key
+#else
+	add		r4, $key, #`248+16`
+#endif
+	vldmia		$ctr, {@XMM[8]}			@ .LREVM0SR
+	mov		r5, $rounds			@ pass rounds
+	vstmia		$fp, {@XMM[10]}			@ save next counter
+	sub		$const, $ctr, #.LREVM0SR-.LSR	@ pass constants
+
+	bl		_bsaes_encrypt8_alt
+
+	subs		$len, $len, #8
+	blo		.Lctr_enc_loop_done
+
+	vld1.8		{@XMM[8]-@XMM[9]}, [$inp]!	@ load input
+	vld1.8		{@XMM[10]-@XMM[11]}, [$inp]!
+	veor		@XMM[0], @XMM[8]
+	veor		@XMM[1], @XMM[9]
+	vld1.8		{@XMM[12]-@XMM[13]}, [$inp]!
+	veor		@XMM[4], @XMM[10]
+	veor		@XMM[6], @XMM[11]
+	vld1.8		{@XMM[14]-@XMM[15]}, [$inp]!
+	veor		@XMM[3], @XMM[12]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!	@ write output
+	veor		@XMM[7], @XMM[13]
+	veor		@XMM[2], @XMM[14]
+	vst1.8		{@XMM[4]}, [$out]!
+	veor		@XMM[5], @XMM[15]
+	vst1.8		{@XMM[6]}, [$out]!
+	vmov.i32	@XMM[8], #1			@ compose 1<<96
+	vst1.8		{@XMM[3]}, [$out]!
+	veor		@XMM[9], @XMM[9], @XMM[9]
+	vst1.8		{@XMM[7]}, [$out]!
+	vext.8		@XMM[8], @XMM[9], @XMM[8], #4
+	vst1.8		{@XMM[2]}, [$out]!
+	vadd.u32	@XMM[9],@XMM[8],@XMM[8]		@ compose 2<<96
+	vst1.8		{@XMM[5]}, [$out]!
+	vldmia		$fp, {@XMM[0]}			@ load counter
+
+	bne		.Lctr_enc_loop
+	b		.Lctr_enc_done
+
+.align	4
+.Lctr_enc_loop_done:
+	add		$len, $len, #8
+	vld1.8		{@XMM[8]}, [$inp]!	@ load input
+	veor		@XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [$out]!	@ write output
+	cmp		$len, #2
+	blo		.Lctr_enc_done
+	vld1.8		{@XMM[9]}, [$inp]!
+	veor		@XMM[1], @XMM[9]
+	vst1.8		{@XMM[1]}, [$out]!
+	beq		.Lctr_enc_done
+	vld1.8		{@XMM[10]}, [$inp]!
+	veor		@XMM[4], @XMM[10]
+	vst1.8		{@XMM[4]}, [$out]!
+	cmp		$len, #4
+	blo		.Lctr_enc_done
+	vld1.8		{@XMM[11]}, [$inp]!
+	veor		@XMM[6], @XMM[11]
+	vst1.8		{@XMM[6]}, [$out]!
+	beq		.Lctr_enc_done
+	vld1.8		{@XMM[12]}, [$inp]!
+	veor		@XMM[3], @XMM[12]
+	vst1.8		{@XMM[3]}, [$out]!
+	cmp		$len, #6
+	blo		.Lctr_enc_done
+	vld1.8		{@XMM[13]}, [$inp]!
+	veor		@XMM[7], @XMM[13]
+	vst1.8		{@XMM[7]}, [$out]!
+	beq		.Lctr_enc_done
+	vld1.8		{@XMM[14]}, [$inp]
+	veor		@XMM[2], @XMM[14]
+	vst1.8		{@XMM[2]}, [$out]!
+
+.Lctr_enc_done:
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifndef	BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:			@ wipe key schedule [if any]
+	vstmia		$keysched!, {q0-q1}
+	cmp		$keysched, $fp
+	bne		.Lctr_enc_bzero
+#else
+	vstmia		$keysched, {q0-q1}
+#endif
+
+	mov	sp, $fp
+	add	sp, #0x10		@ add sp,$fp,#0x10 is no good for thumb
+	VFP_ABI_POP
+	ldmia	sp!, {r4-r10, pc}	@ return
+
+.align	4
+.Lctr_enc_short:
+	ldr	ip, [sp]		@ ctr pointer is passed on stack
+	stmdb	sp!, {r4-r8, lr}
+
+	mov	r4, $inp		@ copy arguments
+	mov	r5, $out
+	mov	r6, $len
+	mov	r7, $key
+	ldr	r8, [ip, #12]		@ load counter LSW
+	vld1.8	{@XMM[1]}, [ip]		@ load whole counter value
+#ifdef __ARMEL__
+	rev	r8, r8
+#endif
+	sub	sp, sp, #0x10
+	vst1.8	{@XMM[1]}, [sp,:64]	@ copy counter value
+	sub	sp, sp, #0x10
+
+.Lctr_enc_short_loop:
+	add	r0, sp, #0x10		@ input counter value
+	mov	r1, sp			@ output on the stack
+	mov	r2, r7			@ key
+
+	bl	AES_encrypt
+
+	vld1.8	{@XMM[0]}, [r4]!	@ load input
+	vld1.8	{@XMM[1]}, [sp,:64]	@ load encrypted counter
+	add	r8, r8, #1
+#ifdef __ARMEL__
+	rev	r0, r8
+	str	r0, [sp, #0x1c]		@ next counter value
+#else
+	str	r8, [sp, #0x1c]		@ next counter value
+#endif
+	veor	@XMM[0],@XMM[0],@XMM[1]
+	vst1.8	{@XMM[0]}, [r5]!	@ store output
+	subs	r6, r6, #1
+	bne	.Lctr_enc_short_loop
+
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+	vstmia		sp!, {q0-q1}
+
+	ldmia	sp!, {r4-r8, pc}
+.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+}
+{
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,
+#	const unsigned char iv[16]);
+#
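The prototype above is the OpenSSL-facing one. In the kernel build XTS_CHAIN_TWEAK is defined (see the #else block near the top of this file), and the "ldr r0, [ip]" path below takes a pointer to a caller-maintained 16-byte tweak as the fifth argument instead of key2/iv, writing the chained tweak back before returning; this is why aesbs-glue.c calls these routines with five arguments and passes walk.iv. A hedged reconstruction of the declaration the glue code effectively relies on (the key-schedule struct name is a placeholder, not taken from the patch):

	#include <stddef.h>

	/* Reconstructed for illustration only, not copied from the patch. */
	struct bsaes_key;	/* bit-sliced key schedule, opaque to the caller */

	void bsaes_xts_encrypt(const unsigned char *in, unsigned char *out,
			       size_t len, struct bsaes_key *key,
			       unsigned char tweak[16]);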
+my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
+my $const="r6";		# returned by _bsaes_key_convert
+my $twmask=@XMM[5];
+my @T=@XMM[6..7];
+
+$code.=<<___;
+.globl	bsaes_xts_encrypt
+.type	bsaes_xts_encrypt,%function
+.align	4
+bsaes_xts_encrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future $fp
+
+	mov	$inp, r0
+	mov	$out, r1
+	mov	$len, r2
+	mov	$key, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	AES_encrypt
+	mov	r0,sp				@ pointer to initial tweak
+#endif
+
+	ldr	$rounds, [$key, #240]		@ get # of rounds
+	mov	$fp, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
+	sub	r12, #`32+16`			@ place for tweak[9]
+
+	@ populate the key schedule
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}			@ save last round key
+#else
+	ldr	r12, [$key, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [$key, #244]
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	add	r12, $key, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
+	vstmia	r12, {@XMM[7]}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+
+	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
+	adr	$magic, .Lxts_magic
+
+	subs	$len, #0x80
+	blo	.Lxts_enc_short
+	b	.Lxts_enc_loop
+
+.align	4
+.Lxts_enc_loop:
+	vldmia		$magic, {$twmask}	@ load XTS magic
+	vshr.s64	@T[0], @XMM[8], #63
+	mov		r0, sp
+	vand		@T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
+	vst1.64		{@XMM[$i-1]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	vshr.s64	@T[1], @XMM[$i], #63
+	veor		@XMM[$i], @XMM[$i], @T[0]
+	vand		@T[1], @T[1], $twmask
+___
+	@T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+	vld1.8		{@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
+	vst1.64		{@XMM[15]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	veor		@XMM[8], @XMM[8], @T[0]
+	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+
+	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
+	veor		@XMM[5], @XMM[5], @XMM[13]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[6], @XMM[6], @XMM[14]
+	mov		r5, $rounds			@ pass rounds
+	veor		@XMM[7], @XMM[7], @XMM[15]
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
+	veor		@XMM[10], @XMM[3], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	veor		@XMM[12], @XMM[2], @XMM[14]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+	veor		@XMM[13], @XMM[5], @XMM[15]
+	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+
+	subs		$len, #0x80
+	bpl		.Lxts_enc_loop
+
+.Lxts_enc_short:
+	adds		$len, #0x70
+	bmi		.Lxts_enc_done
+
+	vldmia		$magic, {$twmask}	@ load XTS magic
+	vshr.s64	@T[0], @XMM[8], #63
+	mov		r0, sp
+	vand		@T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
+	vst1.64		{@XMM[$i-1]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	vshr.s64	@T[1], @XMM[$i], #63
+	veor		@XMM[$i], @XMM[$i], @T[0]
+	vand		@T[1], @T[1], $twmask
+___
+	@T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+	vld1.8		{@XMM[$i-10]}, [$inp]!
+	subs		$len, #0x10
+	bmi		.Lxts_enc_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+	sub		$len, #0x10
+	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
+
+	vld1.8		{@XMM[6]}, [$inp]!
+	veor		@XMM[5], @XMM[5], @XMM[13]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[6], @XMM[6], @XMM[14]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	vld1.64		{@XMM[14]}, [r0,:128]!
+	veor		@XMM[10], @XMM[3], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	veor		@XMM[12], @XMM[2], @XMM[14]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+	vst1.8		{@XMM[12]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_6:
+	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[4], @XMM[4], @XMM[12]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[5], @XMM[5], @XMM[13]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	veor		@XMM[10], @XMM[3], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+
+@ put this in range for both ARM and Thumb mode adr instructions
+.align	5
+.Lxts_magic:
+	.quad	1, 0x87
+
+.align	5
+.Lxts_enc_5:
+	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[3], @XMM[3], @XMM[11]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[4], @XMM[4], @XMM[12]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	veor		@XMM[10], @XMM[3], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	vst1.8		{@XMM[10]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_4:
+	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[2], @XMM[2], @XMM[10]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[3], @XMM[3], @XMM[11]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[6], @XMM[11]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_3:
+	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[1], @XMM[1], @XMM[9]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[2], @XMM[2], @XMM[10]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
+	vld1.64		{@XMM[10]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[4], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	vst1.8		{@XMM[8]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_2:
+	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[0], @XMM[0], @XMM[8]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[1], @XMM[1], @XMM[9]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_encrypt8
+
+	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_enc_done
+.align	4
+.Lxts_enc_1:
+	mov		r0, sp
+	veor		@XMM[0], @XMM[8]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+	mov		r4, $fp				@ preserve fp
+
+	bl		AES_encrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [$out]!
+	mov		$fp, r4
+
+	vmov		@XMM[8], @XMM[9]		@ next round tweak
+
+.Lxts_enc_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds		$len, #0x10
+	beq		.Lxts_enc_ret
+	sub		r6, $out, #0x10
+
+.Lxts_enc_steal:
+	ldrb		r0, [$inp], #1
+	ldrb		r1, [$out, #-0x10]
+	strb		r0, [$out, #-0x10]
+	strb		r1, [$out], #1
+
+	subs		$len, #1
+	bhi		.Lxts_enc_steal
+
+	vld1.8		{@XMM[0]}, [r6]
+	mov		r0, sp
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+	mov		r4, $fp			@ preserve fp
+
+	bl		AES_encrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [r6]
+	mov		$fp, r4
+#endif
+
+.Lxts_enc_ret:
+	bic		r0, $fp, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+.Lxts_enc_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r0
+	bne		.Lxts_enc_bzero
+
+	mov		sp, $fp
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8		{@XMM[8]}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia		sp!, {r4-r10, pc}	@ return
+
+.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl	bsaes_xts_decrypt
+.type	bsaes_xts_decrypt,%function
+.align	4
+bsaes_xts_decrypt:
+	mov	ip, sp
+	stmdb	sp!, {r4-r10, lr}		@ 0x20
+	VFP_ABI_PUSH
+	mov	r6, sp				@ future $fp
+
+	mov	$inp, r0
+	mov	$out, r1
+	mov	$len, r2
+	mov	$key, r3
+
+	sub	r0, sp, #0x10			@ 0x10
+	bic	r0, #0xf			@ align at 16 bytes
+	mov	sp, r0
+
+#ifdef	XTS_CHAIN_TWEAK
+	ldr	r0, [ip]			@ pointer to input tweak
+#else
+	@ generate initial tweak
+	ldr	r0, [ip, #4]			@ iv[]
+	mov	r1, sp
+	ldr	r2, [ip, #0]			@ key2
+	bl	AES_encrypt
+	mov	r0, sp				@ pointer to initial tweak
+#endif
+
+	ldr	$rounds, [$key, #240]		@ get # of rounds
+	mov	$fp, r6
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
+	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
+	sub	r12, #`32+16`			@ place for tweak[9]
+
+	@ populate the key schedule
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	mov	sp, r12
+	add	r12, #0x90			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, sp, #0x90
+	vldmia	r4, {@XMM[6]}
+	vstmia	r12,  {@XMM[15]}		@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	r4, {@XMM[7]}
+#else
+	ldr	r12, [$key, #244]
+	eors	r12, #1
+	beq	0f
+
+	str	r12, [$key, #244]
+	mov	r4, $key			@ pass key
+	mov	r5, $rounds			@ pass # of rounds
+	add	r12, $key, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, $key, #248
+	vldmia	r4, {@XMM[6]}
+	vstmia	r12,  {@XMM[15]}		@ save last round key
+	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
+	vstmia	r4, {@XMM[7]}
+
+.align	2
+0:	sub	sp, #0x90			@ place for tweak[9]
+#endif
+	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
+	adr	$magic, .Lxts_magic
+
+	tst	$len, #0xf			@ if not multiple of 16
+	it	ne				@ Thumb2 thing, sanity check in ARM
+	subne	$len, #0x10			@ subtract another 16 bytes
+	subs	$len, #0x80
+
+	blo	.Lxts_dec_short
+	b	.Lxts_dec_loop
+
+.align	4
+.Lxts_dec_loop:
+	vldmia		$magic, {$twmask}	@ load XTS magic
+	vshr.s64	@T[0], @XMM[8], #63
+	mov		r0, sp
+	vand		@T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
+	vst1.64		{@XMM[$i-1]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	vshr.s64	@T[1], @XMM[$i], #63
+	veor		@XMM[$i], @XMM[$i], @T[0]
+	vand		@T[1], @T[1], $twmask
+___
+	@T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+	vld1.8		{@XMM[$i-10]}, [$inp]!
+___
+$code.=<<___ if ($i>=11);
+	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
+	vst1.64		{@XMM[15]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	veor		@XMM[8], @XMM[8], @T[0]
+	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+
+	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
+	veor		@XMM[5], @XMM[5], @XMM[13]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[6], @XMM[6], @XMM[14]
+	mov		r5, $rounds			@ pass rounds
+	veor		@XMM[7], @XMM[7], @XMM[15]
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
+	veor		@XMM[10], @XMM[2], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	veor		@XMM[12], @XMM[3], @XMM[14]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+	veor		@XMM[13], @XMM[5], @XMM[15]
+	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+
+	subs		$len, #0x80
+	bpl		.Lxts_dec_loop
+
+.Lxts_dec_short:
+	adds		$len, #0x70
+	bmi		.Lxts_dec_done
+
+	vldmia		$magic, {$twmask}	@ load XTS magic
+	vshr.s64	@T[0], @XMM[8], #63
+	mov		r0, sp
+	vand		@T[0], @T[0], $twmask
+___
+for($i=9;$i<16;$i++) {
+$code.=<<___;
+	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
+	vst1.64		{@XMM[$i-1]}, [r0,:128]!
+	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
+	vshr.s64	@T[1], @XMM[$i], #63
+	veor		@XMM[$i], @XMM[$i], @T[0]
+	vand		@T[1], @T[1], $twmask
+___
+	@T=reverse(@T);
+
+$code.=<<___ if ($i>=10);
+	vld1.8		{@XMM[$i-10]}, [$inp]!
+	subs		$len, #0x10
+	bmi		.Lxts_dec_`$i-9`
+___
+$code.=<<___ if ($i>=11);
+	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
+___
+}
+$code.=<<___;
+	sub		$len, #0x10
+	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
+
+	vld1.8		{@XMM[6]}, [$inp]!
+	veor		@XMM[5], @XMM[5], @XMM[13]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[6], @XMM[6], @XMM[14]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	vld1.64		{@XMM[14]}, [r0,:128]!
+	veor		@XMM[10], @XMM[2], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	veor		@XMM[12], @XMM[3], @XMM[14]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+	vst1.8		{@XMM[12]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_6:
+	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[4], @XMM[4], @XMM[12]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[5], @XMM[5], @XMM[13]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	veor		@XMM[10], @XMM[2], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	veor		@XMM[11], @XMM[7], @XMM[13]
+	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_5:
+	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[3], @XMM[3], @XMM[11]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[4], @XMM[4], @XMM[12]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	vld1.64		{@XMM[12]}, [r0,:128]!
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	veor		@XMM[10], @XMM[2], @XMM[12]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+	vst1.8		{@XMM[10]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_4:
+	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[2], @XMM[2], @XMM[10]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[3], @XMM[3], @XMM[11]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
+	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	veor		@XMM[9], @XMM[4], @XMM[11]
+	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_3:
+	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[1], @XMM[1], @XMM[9]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[2], @XMM[2], @XMM[10]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
+	vld1.64		{@XMM[10]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	veor		@XMM[8], @XMM[6], @XMM[10]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+	vst1.8		{@XMM[8]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_2:
+	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
+
+	veor		@XMM[0], @XMM[0], @XMM[8]
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add		r4, sp, #0x90			@ pass key schedule
+#else
+	add		r4, $key, #248			@ pass key schedule
+#endif
+	veor		@XMM[1], @XMM[1], @XMM[9]
+	mov		r5, $rounds			@ pass rounds
+	mov		r0, sp
+
+	bl		_bsaes_decrypt8
+
+	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
+	veor		@XMM[0], @XMM[0], @XMM[ 8]
+	veor		@XMM[1], @XMM[1], @XMM[ 9]
+	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
+
+	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
+	b		.Lxts_dec_done
+.align	4
+.Lxts_dec_1:
+	mov		r0, sp
+	veor		@XMM[0], @XMM[8]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+	mov		r4, $fp				@ preserve fp
+	mov		r5, $magic			@ preserve magic
+
+	bl		AES_decrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [$out]!
+	mov		$fp, r4
+	mov		$magic, r5
+
+	vmov		@XMM[8], @XMM[9]		@ next round tweak
+
+.Lxts_dec_done:
+#ifndef	XTS_CHAIN_TWEAK
+	adds		$len, #0x10
+	beq		.Lxts_dec_ret
+
+	@ calculate one round of extra tweak for the stolen ciphertext
+	vldmia		$magic, {$twmask}
+	vshr.s64	@XMM[6], @XMM[8], #63
+	vand		@XMM[6], @XMM[6], $twmask
+	vadd.u64	@XMM[9], @XMM[8], @XMM[8]
+	vswp		`&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
+	veor		@XMM[9], @XMM[9], @XMM[6]
+
+	@ perform the final decryption with the last tweak value
+	vld1.8		{@XMM[0]}, [$inp]!
+	mov		r0, sp
+	veor		@XMM[0], @XMM[0], @XMM[9]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+	mov		r4, $fp			@ preserve fp
+
+	bl		AES_decrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[9]
+	vst1.8		{@XMM[0]}, [$out]
+
+	mov		r6, $out
+.Lxts_dec_steal:
+	ldrb		r1, [$out]
+	ldrb		r0, [$inp], #1
+	strb		r1, [$out, #0x10]
+	strb		r0, [$out], #1
+
+	subs		$len, #1
+	bhi		.Lxts_dec_steal
+
+	vld1.8		{@XMM[0]}, [r6]
+	mov		r0, sp
+	veor		@XMM[0], @XMM[8]
+	mov		r1, sp
+	vst1.8		{@XMM[0]}, [sp,:128]
+	mov		r2, $key
+
+	bl		AES_decrypt
+
+	vld1.8		{@XMM[0]}, [sp,:128]
+	veor		@XMM[0], @XMM[0], @XMM[8]
+	vst1.8		{@XMM[0]}, [r6]
+	mov		$fp, r4
+#endif
+
+.Lxts_dec_ret:
+	bic		r0, $fp, #0xf
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifdef	XTS_CHAIN_TWEAK
+	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
+#endif
+.Lxts_dec_bzero:				@ wipe key schedule [if any]
+	vstmia		sp!, {q0-q1}
+	cmp		sp, r0
+	bne		.Lxts_dec_bzero
+
+	mov		sp, $fp
+#ifdef	XTS_CHAIN_TWEAK
+	vst1.8		{@XMM[8]}, [r1]
+#endif
+	VFP_ABI_POP
+	ldmia		sp!, {r4-r10, pc}	@ return
+
+.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+        last if (!s/^#/@/ and !/^$/);
+        print;
+}
+close SELF;
+
+print $code;
+
+close STDOUT;
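
The tweak handling in the XTS loops above (the vshr.s64 #63, vand with the .Lxts_magic constant, vadd.u64, vswp and veor sequences) computes the next tweak by multiplying the current one by x in GF(2^128). A minimal C model of that single step, assuming the tweak is held as two little-endian 64-bit words (illustrative only, not part of this patch):

	#include <stdint.h>

	/* Multiply a 128-bit XTS tweak by x in GF(2^128).
	 * t[0] holds bits 0..63, t[1] holds bits 64..127. */
	static void xts_tweak_mul_x(uint64_t t[2])
	{
		uint64_t carry = t[1] >> 63;		/* bit that falls off the top */

		t[1] = (t[1] << 1) | (t[0] >> 63);	/* 128-bit left shift by one */
		t[0] = (t[0] << 1) ^ (carry * 0x87);	/* reduce: x^128 = x^7 + x^2 + x + 1 */
	}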

+ 1 - 0
arch/arm/include/asm/Kbuild

@@ -24,6 +24,7 @@ generic-y += sembuf.h
 generic-y += serial.h
 generic-y += shmbuf.h
 generic-y += siginfo.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += socket.h
 generic-y += sockios.h

+ 7 - 0
arch/arm/include/asm/assembler.h

@@ -53,6 +53,13 @@
 #define put_byte_3      lsl #0
 #endif
 
+/* Select code for any configuration running in BE8 mode */
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define ARM_BE8(code...) code
+#else
+#define ARM_BE8(code...)
+#endif
+
 /*
  * Data preload for architectures that support it
  */

+ 46 - 62
arch/arm/include/asm/atomic.h

@@ -12,6 +12,7 @@
 #define __ASM_ARM_ATOMIC_H
 
 #include <linux/compiler.h>
+#include <linux/prefetch.h>
 #include <linux/types.h>
 #include <linux/irqflags.h>
 #include <asm/barrier.h>
@@ -41,6 +42,7 @@ static inline void atomic_add(int i, atomic_t *v)
 	unsigned long tmp;
 	int result;
 
+	prefetchw(&v->counter);
 	__asm__ __volatile__("@ atomic_add\n"
 "1:	ldrex	%0, [%3]\n"
 "	add	%0, %0, %4\n"
@@ -79,6 +81,7 @@ static inline void atomic_sub(int i, atomic_t *v)
 	unsigned long tmp;
 	int result;
 
+	prefetchw(&v->counter);
 	__asm__ __volatile__("@ atomic_sub\n"
 "1:	ldrex	%0, [%3]\n"
 "	sub	%0, %0, %4\n"
@@ -114,7 +117,8 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 
 static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 {
-	unsigned long oldval, res;
+	int oldval;
+	unsigned long res;
 
 	smp_mb();
 
@@ -134,21 +138,6 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 	return oldval;
 }
 
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-	unsigned long tmp, tmp2;
-
-	__asm__ __volatile__("@ atomic_clear_mask\n"
-"1:	ldrex	%0, [%3]\n"
-"	bic	%0, %0, %4\n"
-"	strex	%1, %0, [%3]\n"
-"	teq	%1, #0\n"
-"	bne	1b"
-	: "=&r" (tmp), "=&r" (tmp2), "+Qo" (*addr)
-	: "r" (addr), "Ir" (mask)
-	: "cc");
-}
-
 #else /* ARM_ARCH_6 */
 
 #ifdef CONFIG_SMP
@@ -197,15 +186,6 @@ static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return ret;
 }
 
-static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr)
-{
-	unsigned long flags;
-
-	raw_local_irq_save(flags);
-	*addr &= ~mask;
-	raw_local_irq_restore(flags);
-}
-
 #endif /* __LINUX_ARM_ARCH__ */
 
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
@@ -238,15 +218,15 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 
 #ifndef CONFIG_GENERIC_ATOMIC64
 typedef struct {
-	u64 __aligned(8) counter;
+	long long counter;
 } atomic64_t;
 
 #define ATOMIC64_INIT(i) { (i) }
 
 #ifdef CONFIG_ARM_LPAE
-static inline u64 atomic64_read(const atomic64_t *v)
+static inline long long atomic64_read(const atomic64_t *v)
 {
-	u64 result;
+	long long result;
 
 	__asm__ __volatile__("@ atomic64_read\n"
 "	ldrd	%0, %H0, [%1]"
@@ -257,7 +237,7 @@ static inline u64 atomic64_read(const atomic64_t *v)
 	return result;
 }
 
-static inline void atomic64_set(atomic64_t *v, u64 i)
+static inline void atomic64_set(atomic64_t *v, long long i)
 {
 	__asm__ __volatile__("@ atomic64_set\n"
 "	strd	%2, %H2, [%1]"
@@ -266,9 +246,9 @@ static inline void atomic64_set(atomic64_t *v, u64 i)
 	);
 }
 #else
-static inline u64 atomic64_read(const atomic64_t *v)
+static inline long long atomic64_read(const atomic64_t *v)
 {
-	u64 result;
+	long long result;
 
 	__asm__ __volatile__("@ atomic64_read\n"
 "	ldrexd	%0, %H0, [%1]"
@@ -279,10 +259,11 @@ static inline u64 atomic64_read(const atomic64_t *v)
 	return result;
 }
 
-static inline void atomic64_set(atomic64_t *v, u64 i)
+static inline void atomic64_set(atomic64_t *v, long long i)
 {
-	u64 tmp;
+	long long tmp;
 
+	prefetchw(&v->counter);
 	__asm__ __volatile__("@ atomic64_set\n"
 "1:	ldrexd	%0, %H0, [%2]\n"
 "	strexd	%0, %3, %H3, [%2]\n"
@@ -294,15 +275,16 @@ static inline void atomic64_set(atomic64_t *v, u64 i)
 }
 #endif
 
-static inline void atomic64_add(u64 i, atomic64_t *v)
+static inline void atomic64_add(long long i, atomic64_t *v)
 {
-	u64 result;
+	long long result;
 	unsigned long tmp;
 
+	prefetchw(&v->counter);
 	__asm__ __volatile__("@ atomic64_add\n"
 "1:	ldrexd	%0, %H0, [%3]\n"
-"	adds	%0, %0, %4\n"
-"	adc	%H0, %H0, %H4\n"
+"	adds	%Q0, %Q0, %Q4\n"
+"	adc	%R0, %R0, %R4\n"
 "	strexd	%1, %0, %H0, [%3]\n"
 "	teq	%1, #0\n"
 "	bne	1b"
@@ -311,17 +293,17 @@ static inline void atomic64_add(u64 i, atomic64_t *v)
 	: "cc");
 }
 
-static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
+static inline long long atomic64_add_return(long long i, atomic64_t *v)
 {
-	u64 result;
+	long long result;
 	unsigned long tmp;
 
 	smp_mb();
 
 	__asm__ __volatile__("@ atomic64_add_return\n"
 "1:	ldrexd	%0, %H0, [%3]\n"
-"	adds	%0, %0, %4\n"
-"	adc	%H0, %H0, %H4\n"
+"	adds	%Q0, %Q0, %Q4\n"
+"	adc	%R0, %R0, %R4\n"
 "	strexd	%1, %0, %H0, [%3]\n"
 "	teq	%1, #0\n"
 "	bne	1b"
@@ -334,15 +316,16 @@ static inline u64 atomic64_add_return(u64 i, atomic64_t *v)
 	return result;
 }
 
-static inline void atomic64_sub(u64 i, atomic64_t *v)
+static inline void atomic64_sub(long long i, atomic64_t *v)
 {
-	u64 result;
+	long long result;
 	unsigned long tmp;
 
+	prefetchw(&v->counter);
 	__asm__ __volatile__("@ atomic64_sub\n"
 "1:	ldrexd	%0, %H0, [%3]\n"
-"	subs	%0, %0, %4\n"
-"	sbc	%H0, %H0, %H4\n"
+"	subs	%Q0, %Q0, %Q4\n"
+"	sbc	%R0, %R0, %R4\n"
 "	strexd	%1, %0, %H0, [%3]\n"
 "	teq	%1, #0\n"
 "	bne	1b"
@@ -351,17 +334,17 @@ static inline void atomic64_sub(u64 i, atomic64_t *v)
 	: "cc");
 }
 
-static inline u64 atomic64_sub_return(u64 i, atomic64_t *v)
+static inline long long atomic64_sub_return(long long i, atomic64_t *v)
 {
-	u64 result;
+	long long result;
 	unsigned long tmp;
 
 	smp_mb();
 
 	__asm__ __volatile__("@ atomic64_sub_return\n"
 "1:	ldrexd	%0, %H0, [%3]\n"
-"	subs	%0, %0, %4\n"
-"	sbc	%H0, %H0, %H4\n"
+"	subs	%Q0, %Q0, %Q4\n"
+"	sbc	%R0, %R0, %R4\n"
 "	strexd	%1, %0, %H0, [%3]\n"
 "	teq	%1, #0\n"
 "	bne	1b"
@@ -374,9 +357,10 @@ static inline u64 atomic64_sub_return(u64 i, atomic64_t *v)
 	return result;
 }
 
-static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new)
+static inline long long atomic64_cmpxchg(atomic64_t *ptr, long long old,
+					long long new)
 {
-	u64 oldval;
+	long long oldval;
 	unsigned long res;
 
 	smp_mb();
@@ -398,9 +382,9 @@ static inline u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old, u64 new)
 	return oldval;
 }
 
-static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
+static inline long long atomic64_xchg(atomic64_t *ptr, long long new)
 {
-	u64 result;
+	long long result;
 	unsigned long tmp;
 
 	smp_mb();
@@ -419,18 +403,18 @@ static inline u64 atomic64_xchg(atomic64_t *ptr, u64 new)
 	return result;
 }
 
-static inline u64 atomic64_dec_if_positive(atomic64_t *v)
+static inline long long atomic64_dec_if_positive(atomic64_t *v)
 {
-	u64 result;
+	long long result;
 	unsigned long tmp;
 
 	smp_mb();
 
 	__asm__ __volatile__("@ atomic64_dec_if_positive\n"
 "1:	ldrexd	%0, %H0, [%3]\n"
-"	subs	%0, %0, #1\n"
-"	sbc	%H0, %H0, #0\n"
-"	teq	%H0, #0\n"
+"	subs	%Q0, %Q0, #1\n"
+"	sbc	%R0, %R0, #0\n"
+"	teq	%R0, #0\n"
 "	bmi	2f\n"
 "	strexd	%1, %0, %H0, [%3]\n"
 "	teq	%1, #0\n"
@@ -445,9 +429,9 @@ static inline u64 atomic64_dec_if_positive(atomic64_t *v)
 	return result;
 }
 
-static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
+static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 {
-	u64 val;
+	long long val;
 	unsigned long tmp;
 	int ret = 1;
 
@@ -459,8 +443,8 @@ static inline int atomic64_add_unless(atomic64_t *v, u64 a, u64 u)
 "	teqeq	%H0, %H5\n"
 "	moveq	%1, #0\n"
 "	beq	2f\n"
-"	adds	%0, %0, %6\n"
-"	adc	%H0, %H0, %H6\n"
+"	adds	%Q0, %Q0, %Q6\n"
+"	adc	%R0, %R0, %R6\n"
 "	strexd	%2, %0, %H0, [%4]\n"
 "	teq	%2, #0\n"
 "	bne	1b\n"

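A note on the operand modifiers introduced above: %Q selects the register holding the least significant 32 bits of a 64-bit operand and %R the most significant 32 bits, regardless of endianness, which is why the adds/adc and subs/sbc pairs move away from the %0/%H0 convention (%H always names the higher-numbered register of the pair). A small standalone sketch of the same idiom, assuming a 32-bit ARM target (not taken from the patch):

	/* 64-bit add on 32-bit ARM using a register pair. */
	static inline long long add64(long long a, long long b)
	{
		long long result;

		__asm__ __volatile__(
		"	adds	%Q0, %Q1, %Q2\n"
		"	adc	%R0, %R1, %R2"
		: "=&r" (result)
		: "r" (a), "r" (b)
		: "cc");

		return result;
	}
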
+ 77 - 0
arch/arm/include/asm/bL_switcher.h

@@ -0,0 +1,77 @@
+/*
+ * arch/arm/include/asm/bL_switcher.h
+ *
+ * Created by:  Nicolas Pitre, April 2012
+ * Copyright:   (C) 2012-2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef ASM_BL_SWITCHER_H
+#define ASM_BL_SWITCHER_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+typedef void (*bL_switch_completion_handler)(void *cookie);
+
+int bL_switch_request_cb(unsigned int cpu, unsigned int new_cluster_id,
+			 bL_switch_completion_handler completer,
+			 void *completer_cookie);
+static inline int bL_switch_request(unsigned int cpu, unsigned int new_cluster_id)
+{
+	return bL_switch_request_cb(cpu, new_cluster_id, NULL, NULL);
+}
+
+/*
+ * Register here to be notified about runtime enabling/disabling of
+ * the switcher.
+ *
+ * The notifier chain is called with the switcher activation lock held:
+ * the switcher will not be enabled or disabled during callbacks.
+ * Callbacks must not call bL_switcher_{get,put}_enabled().
+ */
+#define BL_NOTIFY_PRE_ENABLE	0
+#define BL_NOTIFY_POST_ENABLE	1
+#define BL_NOTIFY_PRE_DISABLE	2
+#define BL_NOTIFY_POST_DISABLE	3
+
+#ifdef CONFIG_BL_SWITCHER
+
+int bL_switcher_register_notifier(struct notifier_block *nb);
+int bL_switcher_unregister_notifier(struct notifier_block *nb);
+
+/*
+ * Use these functions to temporarily prevent enabling/disabling of
+ * the switcher.
+ * bL_switcher_get_enabled() returns true if the switcher is currently
+ * enabled.  Each call to bL_switcher_get_enabled() must be followed
+ * by a call to bL_switcher_put_enabled().  These functions are not
+ * recursive.
+ */
+bool bL_switcher_get_enabled(void);
+void bL_switcher_put_enabled(void);
+
+int bL_switcher_trace_trigger(void);
+int bL_switcher_get_logical_index(u32 mpidr);
+
+#else
+static inline int bL_switcher_register_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline int bL_switcher_unregister_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+
+static inline bool bL_switcher_get_enabled(void) { return false; }
+static inline void bL_switcher_put_enabled(void) { }
+static inline int bL_switcher_trace_trigger(void) { return 0; }
+static inline int bL_switcher_get_logical_index(u32 mpidr) { return -EUNATCH; }
+#endif /* CONFIG_BL_SWITCHER */
+
+#endif
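
A sketch of how a client might use the interface declared above: register a notifier for switcher state changes, and pin the current state around work that must not race with enabling or disabling the switcher (hypothetical module code, not part of this series):

	#include <linux/init.h>
	#include <linux/kernel.h>
	#include <linux/notifier.h>
	#include <asm/bL_switcher.h>

	static int my_bl_notify(struct notifier_block *nb, unsigned long event,
				void *data)
	{
		/* Called with the switcher activation lock held; must not
		 * call bL_switcher_{get,put}_enabled() from here. */
		if (event == BL_NOTIFY_PRE_DISABLE)
			pr_info("big.LITTLE switcher about to be disabled\n");
		return NOTIFY_OK;
	}

	static struct notifier_block my_bl_nb = {
		.notifier_call = my_bl_notify,
	};

	static int __init my_client_init(void)
	{
		int err = bL_switcher_register_notifier(&my_bl_nb);

		if (err)
			return err;

		if (bL_switcher_get_enabled()) {
			/* ... work that depends on the switcher staying enabled ... */
			bL_switcher_put_enabled();
		}
		return 0;
	}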

+ 6 - 4
arch/arm/include/asm/bug.h

@@ -2,6 +2,8 @@
 #define _ASMARM_BUG_H
 
 #include <linux/linkage.h>
+#include <linux/types.h>
+#include <asm/opcodes.h>
 
 #ifdef CONFIG_BUG
 
@@ -12,10 +14,10 @@
  */
 #ifdef CONFIG_THUMB2_KERNEL
 #define BUG_INSTR_VALUE 0xde02
-#define BUG_INSTR_TYPE ".hword "
+#define BUG_INSTR(__value) __inst_thumb16(__value)
 #else
 #define BUG_INSTR_VALUE 0xe7f001f2
-#define BUG_INSTR_TYPE ".word "
+#define BUG_INSTR(__value) __inst_arm(__value)
 #endif
 
 
@@ -33,7 +35,7 @@
 
 #define __BUG(__file, __line, __value)				\
 do {								\
-	asm volatile("1:\t" BUG_INSTR_TYPE #__value "\n"	\
+	asm volatile("1:\t" BUG_INSTR(__value) "\n"  \
 		".pushsection .rodata.str, \"aMS\", %progbits, 1\n" \
 		"2:\t.asciz " #__file "\n" 			\
 		".popsection\n" 				\
@@ -48,7 +50,7 @@ do {								\
 
 #define __BUG(__file, __line, __value)				\
 do {								\
-	asm volatile(BUG_INSTR_TYPE #__value);			\
+	asm volatile(BUG_INSTR(__value) "\n");			\
 	unreachable();						\
 } while (0)
 #endif  /* CONFIG_DEBUG_BUGVERBOSE */

+ 46 - 0
arch/arm/include/asm/cacheflush.h

@@ -435,4 +435,50 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size)
 #define sync_cache_w(ptr) __sync_cache_range_w(ptr, sizeof *(ptr))
 #define sync_cache_r(ptr) __sync_cache_range_r(ptr, sizeof *(ptr))
 
+/*
+ * Disabling cache access for one CPU in an ARMv7 SMP system is tricky.
+ * To do so we must:
+ *
+ * - Clear the SCTLR.C bit to prevent further cache allocations
+ * - Flush the desired level of cache
+ * - Clear the ACTLR "SMP" bit to disable local coherency
+ *
+ * ... and do so without any intervening memory access in between those steps,
+ * not even to the stack.
+ *
+ * WARNING -- After this has been called:
+ *
+ * - No ldrex/strex (and similar) instructions must be used.
+ * - The CPU is obviously no longer coherent with the other CPUs.
+ * - This is unlikely to work as expected if Linux is running non-secure.
+ *
+ * Note:
+ *
+ * - This is known to apply to several ARMv7 processor implementations,
+ *   however some exceptions may exist.  Caveat emptor.
+ *
+ * - The clobber list is dictated by the call to v7_flush_dcache_*.
+ *   fp is preserved to the stack explicitly prior to disabling the cache,
+ *   since adding it to the clobber list is incompatible with having
+ *   CONFIG_FRAME_POINTER=y.  ip is saved as well in case r12-clobbering
+ *   trampolines are inserted by the linker, and to keep sp 64-bit aligned.
+ */
+#define v7_exit_coherency_flush(level) \
+	asm volatile( \
+	"stmfd	sp!, {fp, ip} \n\t" \
+	"mrc	p15, 0, r0, c1, c0, 0	@ get SCTLR \n\t" \
+	"bic	r0, r0, #"__stringify(CR_C)" \n\t" \
+	"mcr	p15, 0, r0, c1, c0, 0	@ set SCTLR \n\t" \
+	"isb	\n\t" \
+	"bl	v7_flush_dcache_"__stringify(level)" \n\t" \
+	"clrex	\n\t" \
+	"mrc	p15, 0, r0, c1, c0, 1	@ get ACTLR \n\t" \
+	"bic	r0, r0, #(1 << 6)	@ disable local coherency \n\t" \
+	"mcr	p15, 0, r0, c1, c0, 1	@ set ACTLR \n\t" \
+	"isb	\n\t" \
+	"dsb	\n\t" \
+	"ldmfd	sp!, {fp, ip}" \
+	: : : "r0","r1","r2","r3","r4","r5","r6","r7", \
+	      "r9","r10","lr","memory" )
+
 #endif
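
A sketch of the intended call site for the new macro, roughly what a platform's CPU power-down path would do (hypothetical platform hook; "louis" flushes to the Level of Unification Inner Shareable, "all" flushes the whole hierarchy):

	static void my_platform_power_down(void)
	{
		if (cluster_is_going_down)		/* hypothetical condition */
			v7_exit_coherency_flush(all);	/* last CPU: flush everything */
		else
			v7_exit_coherency_flush(louis);	/* just this CPU's caches */

		/* The CPU is now non-coherent: park it (WFI or a firmware call),
		 * using no stack and no exclusives from here on. */
	}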

+ 46 - 12
arch/arm/include/asm/cmpxchg.h

@@ -223,6 +223,42 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 	return ret;
 }
 
+static inline unsigned long long __cmpxchg64(unsigned long long *ptr,
+					     unsigned long long old,
+					     unsigned long long new)
+{
+	unsigned long long oldval;
+	unsigned long res;
+
+	__asm__ __volatile__(
+"1:	ldrexd		%1, %H1, [%3]\n"
+"	teq		%1, %4\n"
+"	teqeq		%H1, %H4\n"
+"	bne		2f\n"
+"	strexd		%0, %5, %H5, [%3]\n"
+"	teq		%0, #0\n"
+"	bne		1b\n"
+"2:"
+	: "=&r" (res), "=&r" (oldval), "+Qo" (*ptr)
+	: "r" (ptr), "r" (old), "r" (new)
+	: "cc");
+
+	return oldval;
+}
+
+static inline unsigned long long __cmpxchg64_mb(unsigned long long *ptr,
+						unsigned long long old,
+						unsigned long long new)
+{
+	unsigned long long ret;
+
+	smp_mb();
+	ret = __cmpxchg64(ptr, old, new);
+	smp_mb();
+
+	return ret;
+}
+
 #define cmpxchg_local(ptr,o,n)						\
 	((__typeof__(*(ptr)))__cmpxchg_local((ptr),			\
 				       (unsigned long)(o),		\
@@ -230,18 +266,16 @@ static inline unsigned long __cmpxchg_local(volatile void *ptr,
 				       sizeof(*(ptr))))
 
 #define cmpxchg64(ptr, o, n)						\
-	((__typeof__(*(ptr)))atomic64_cmpxchg(container_of((ptr),	\
-						atomic64_t,		\
-						counter),		\
-					      (unsigned long long)(o),	\
-					      (unsigned long long)(n)))
-
-#define cmpxchg64_local(ptr, o, n)					\
-	((__typeof__(*(ptr)))local64_cmpxchg(container_of((ptr),	\
-						local64_t,		\
-						a),			\
-					     (unsigned long long)(o),	\
-					     (unsigned long long)(n)))
+	((__typeof__(*(ptr)))__cmpxchg64_mb((ptr),			\
+					(unsigned long long)(o),	\
+					(unsigned long long)(n)))
+
+#define cmpxchg64_relaxed(ptr, o, n)					\
+	((__typeof__(*(ptr)))__cmpxchg64((ptr),				\
+					(unsigned long long)(o),	\
+					(unsigned long long)(n)))
+
+#define cmpxchg64_local(ptr, o, n)	cmpxchg64_relaxed((ptr), (o), (n))
 
 #endif	/* __LINUX_ARM_ARCH__ >= 6 */
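
With the rework above, cmpxchg64() operates directly on a 64-bit location via ldrexd/strexd instead of detouring through atomic64_t. A minimal usage sketch, assuming a caller-provided, naturally aligned u64 (not from the patch):

	#include <linux/types.h>

	static u64 sequence;

	static bool advance_if_current(u64 expected)
	{
		/* Atomically move sequence from 'expected' to 'expected + 1'. */
		return cmpxchg64(&sequence, expected, expected + 1) == expected;
	}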
 

+ 1 - 0
arch/arm/include/asm/cputype.h

@@ -10,6 +10,7 @@
 #define CPUID_TLBTYPE	3
 #define CPUID_MPUIR	4
 #define CPUID_MPIDR	5
+#define CPUID_REVIDR	6
 
 #ifdef CONFIG_CPU_V7M
 #define CPUID_EXT_PFR0	0x40

+ 1 - 1
arch/arm/include/asm/hardirq.h

@@ -5,7 +5,7 @@
 #include <linux/threads.h>
 #include <asm/irq.h>
 
-#define NR_IPI	6
+#define NR_IPI	8
 
 typedef struct {
 	unsigned int __softirq_pending;

+ 4 - 4
arch/arm/include/asm/hardware/coresight.h

@@ -24,8 +24,8 @@
 #define TRACER_TIMEOUT 10000
 
 #define etm_writel(t, v, x) \
-	(__raw_writel((v), (t)->etm_regs + (x)))
-#define etm_readl(t, x) (__raw_readl((t)->etm_regs + (x)))
+	(writel_relaxed((v), (t)->etm_regs + (x)))
+#define etm_readl(t, x) (readl_relaxed((t)->etm_regs + (x)))
 
 /* CoreSight Management Registers */
 #define CSMR_LOCKACCESS 0xfb0
@@ -142,8 +142,8 @@
 #define ETBFF_TRIGFL		BIT(10)
 
 #define etb_writel(t, v, x) \
-	(__raw_writel((v), (t)->etb_regs + (x)))
-#define etb_readl(t, x) (__raw_readl((t)->etb_regs + (x)))
+	(writel_relaxed((v), (t)->etb_regs + (x)))
+#define etb_readl(t, x) (readl_relaxed((t)->etb_regs + (x)))
 
 #define etm_lock(t) do { etm_writel((t), 0, CSMR_LOCKACCESS); } while (0)
 #define etm_unlock(t) \

+ 2 - 1
arch/arm/include/asm/kgdb.h

@@ -11,6 +11,7 @@
 #define __ARM_KGDB_H__
 
 #include <linux/ptrace.h>
+#include <asm/opcodes.h>
 
 /*
  * GDB assumes that we're a user process being debugged, so
@@ -41,7 +42,7 @@
 
 static inline void arch_kgdb_breakpoint(void)
 {
-	asm(".word 0xe7ffdeff");
+	asm(__inst_arm(0xe7ffdeff));
 }
 
 extern void kgdb_handle_bus_error(void);

+ 1 - 0
arch/arm/include/asm/mach/arch.h

@@ -49,6 +49,7 @@ struct machine_desc {
 	bool			(*smp_init)(void);
 	void			(*fixup)(struct tag *, char **,
 					 struct meminfo *);
+	void			(*init_meminfo)(void);
 	void			(*reserve)(void);/* reserve mem blocks	*/
 	void			(*map_io)(void);/* IO mapping function	*/
 	void			(*init_early)(void);

+ 39 - 0
arch/arm/include/asm/mcpm.h

@@ -41,6 +41,14 @@ extern void mcpm_entry_point(void);
  */
 void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr);
 
+/*
+ * This sets an early poke, i.e. a value to be poked into some address
+ * from very early assembly code before the CPU is ungated.  The
+ * address must be physical, and if 0 then nothing will happen.
+ */
+void mcpm_set_early_poke(unsigned cpu, unsigned cluster,
+			 unsigned long poke_phys_addr, unsigned long poke_val);
+
 /*
  * CPU/cluster power operations API for higher subsystems to use.
  */
@@ -81,9 +89,39 @@ int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster);
  *
  * This will return if mcpm_platform_register() has not been called
  * previously in which case the caller should take appropriate action.
+ *
+ * On success, the CPU is not guaranteed to be truly halted until
+ * mcpm_cpu_power_down_finish() subsequently returns non-zero for the
+ * specified cpu.  Until then, other CPUs should make sure they do not
+ * trash memory the target CPU might be executing/accessing.
  */
 void mcpm_cpu_power_down(void);
 
+/**
+ * mcpm_cpu_power_down_finish - wait for a specified CPU to halt, and
+ *	make sure it is powered off
+ *
+ * @cpu: CPU number within given cluster
+ * @cluster: cluster number for the CPU
+ *
+ * Call this function to ensure that a pending powerdown has taken
+ * effect and the CPU is safely parked before performing non-mcpm
+ * operations that may affect the CPU (such as kexec trashing the
+ * kernel text).
+ *
+ * It is *not* necessary to call this function if you only need to
+ * serialise a pending powerdown with mcpm_cpu_power_up() or a wakeup
+ * event.
+ *
+ * Do not call this function unless the specified CPU has already
+ * called mcpm_cpu_power_down() or has committed to doing so.
+ *
+ * @return:
+ *	- zero if the CPU is in a safely parked state
+ *	- nonzero otherwise (e.g., timeout)
+ */
+int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster);
+
 /**
  * mcpm_cpu_suspend - bring the calling CPU in a suspended state
  *
@@ -126,6 +164,7 @@ int mcpm_cpu_powered_up(void);
 struct mcpm_platform_ops {
 	int (*power_up)(unsigned int cpu, unsigned int cluster);
 	void (*power_down)(void);
+	int (*power_down_finish)(unsigned int cpu, unsigned int cluster);
 	void (*suspend)(u64);
 	void (*powered_up)(void);
 };
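
The pattern the new hook and kernel-doc above enable, roughly as a platform's smp_operations.cpu_kill implementation would use it (a sketch: cpu_kill returns 1 for a dead CPU, and mcpm_cpu_power_down_finish() returns 0 once the CPU is safely parked):

	#include <asm/cputype.h>
	#include <asm/mcpm.h>
	#include <asm/smp_plat.h>

	static int my_cpu_kill(unsigned int l_cpu)
	{
		unsigned int mpidr = cpu_logical_map(l_cpu);
		unsigned int cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
		unsigned int cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);

		/* Wait for the pending powerdown to take effect before
		 * reporting the CPU dead (e.g. so kexec cannot trash
		 * kernel text the dying CPU is still executing). */
		return !mcpm_cpu_power_down_finish(cpu, cluster);
	}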

+ 67 - 9
arch/arm/include/asm/memory.h

@@ -172,8 +172,13 @@
  * so that all we need to do is modify the 8-bit constant field.
  */
 #define __PV_BITS_31_24	0x81000000
+#define __PV_BITS_7_0	0x81
+
+extern u64 __pv_phys_offset;
+extern u64 __pv_offset;
+extern void fixup_pv_table(const void *, unsigned long);
+extern const void *__pv_table_begin, *__pv_table_end;
 
-extern unsigned long __pv_phys_offset;
 #define PHYS_OFFSET __pv_phys_offset
 
 #define __pv_stub(from,to,instr,type)			\
@@ -185,22 +190,58 @@ extern unsigned long __pv_phys_offset;
 	: "=r" (to)					\
 	: "r" (from), "I" (type))
 
-static inline unsigned long __virt_to_phys(unsigned long x)
+#define __pv_stub_mov_hi(t)				\
+	__asm__ volatile("@ __pv_stub_mov\n"		\
+	"1:	mov	%R0, %1\n"			\
+	"	.pushsection .pv_table,\"a\"\n"		\
+	"	.long	1b\n"				\
+	"	.popsection\n"				\
+	: "=r" (t)					\
+	: "I" (__PV_BITS_7_0))
+
+#define __pv_add_carry_stub(x, y)			\
+	__asm__ volatile("@ __pv_add_carry_stub\n"	\
+	"1:	adds	%Q0, %1, %2\n"			\
+	"	adc	%R0, %R0, #0\n"			\
+	"	.pushsection .pv_table,\"a\"\n"		\
+	"	.long	1b\n"				\
+	"	.popsection\n"				\
+	: "+r" (y)					\
+	: "r" (x), "I" (__PV_BITS_31_24)		\
+	: "cc")
+
+static inline phys_addr_t __virt_to_phys(unsigned long x)
 {
-	unsigned long t;
-	__pv_stub(x, t, "add", __PV_BITS_31_24);
+	phys_addr_t t;
+
+	if (sizeof(phys_addr_t) == 4) {
+		__pv_stub(x, t, "add", __PV_BITS_31_24);
+	} else {
+		__pv_stub_mov_hi(t);
+		__pv_add_carry_stub(x, t);
+	}
 	return t;
 }
 
-static inline unsigned long __phys_to_virt(unsigned long x)
+static inline unsigned long __phys_to_virt(phys_addr_t x)
 {
 	unsigned long t;
 	__pv_stub(x, t, "sub", __PV_BITS_31_24);
 	return t;
 }
+
 #else
-#define __virt_to_phys(x)	((x) - PAGE_OFFSET + PHYS_OFFSET)
-#define __phys_to_virt(x)	((x) - PHYS_OFFSET + PAGE_OFFSET)
+
+static inline phys_addr_t __virt_to_phys(unsigned long x)
+{
+	return (phys_addr_t)x - PAGE_OFFSET + PHYS_OFFSET;
+}
+
+static inline unsigned long __phys_to_virt(phys_addr_t x)
+{
+	return x - PHYS_OFFSET + PAGE_OFFSET;
+}
+
 #endif
 #endif
 #endif /* __ASSEMBLY__ */
@@ -238,16 +279,33 @@ static inline phys_addr_t virt_to_phys(const volatile void *x)
 
 static inline void *phys_to_virt(phys_addr_t x)
 {
-	return (void *)(__phys_to_virt((unsigned long)(x)));
+	return (void *)__phys_to_virt(x);
 }
 
 /*
  * Drivers should NOT use these either.
  */
 #define __pa(x)			__virt_to_phys((unsigned long)(x))
-#define __va(x)			((void *)__phys_to_virt((unsigned long)(x)))
+#define __va(x)			((void *)__phys_to_virt((phys_addr_t)(x)))
 #define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
 
+extern phys_addr_t (*arch_virt_to_idmap)(unsigned long x);
+
+/*
+ * These are for systems that have a hardware interconnect supported alias of
+ * physical memory for idmap purposes.  Most cases should leave these
+ * untouched.
+ */
+static inline phys_addr_t __virt_to_idmap(unsigned long x)
+{
+	if (arch_virt_to_idmap)
+		return arch_virt_to_idmap(x);
+	else
+		return __virt_to_phys(x);
+}
+
+#define virt_to_idmap(x)	__virt_to_idmap((unsigned long)(x))
+
 /*
  * Virtual <-> DMA view memory address translations
  * Again, these are *only* valid on the kernel direct mapped RAM

+ 1 - 1
arch/arm/include/asm/mmu.h

@@ -16,7 +16,7 @@ typedef struct {
 #ifdef CONFIG_CPU_HAS_ASID
 #define ASID_BITS	8
 #define ASID_MASK	((~0ULL) << ASID_BITS)
-#define ASID(mm)	((mm)->context.id.counter & ~ASID_MASK)
+#define ASID(mm)	((unsigned int)((mm)->context.id.counter & ~ASID_MASK))
 #else
 #define ASID(mm)	(0)
 #endif

+ 7 - 0
arch/arm/include/asm/pgtable-2level.h

@@ -181,6 +181,13 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 
 #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
 
+/*
+ * We don't have huge page support for short descriptors, for the moment
+ * define empty stubs for use by pin_page_for_write.
+ */
+#define pmd_hugewillfault(pmd)	(0)
+#define pmd_thp_or_huge(pmd)	(0)
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_PGTABLE_2LEVEL_H */

+ 3 - 0
arch/arm/include/asm/pgtable-3level.h

@@ -206,6 +206,9 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 #define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd)		(!(pmd_val(pmd) & PMD_SECT_RDONLY))
 
+#define pmd_hugewillfault(pmd)	(!pmd_young(pmd) || !pmd_write(pmd))
+#define pmd_thp_or_huge(pmd)	(pmd_huge(pmd) || pmd_trans_huge(pmd))
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)	(pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
 #define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)

+ 25 - 8
arch/arm/include/asm/processor.h

@@ -22,6 +22,7 @@
 #include <asm/hw_breakpoint.h>
 #include <asm/ptrace.h>
 #include <asm/types.h>
+#include <asm/unified.h>
 
 #ifdef __KERNEL__
 #define STACK_TOP	((current->personality & ADDR_LIMIT_32BIT) ? \
@@ -87,6 +88,17 @@ unsigned long get_wchan(struct task_struct *p);
 #define KSTK_EIP(tsk)	task_pt_regs(tsk)->ARM_pc
 #define KSTK_ESP(tsk)	task_pt_regs(tsk)->ARM_sp
 
+#ifdef CONFIG_SMP
+#define __ALT_SMP_ASM(smp, up)						\
+	"9998:	" smp "\n"						\
+	"	.pushsection \".alt.smp.init\", \"a\"\n"		\
+	"	.long	9998b\n"					\
+	"	" up "\n"						\
+	"	.popsection\n"
+#else
+#define __ALT_SMP_ASM(smp, up)	up
+#endif
+
 /*
  * Prefetching support - only ARMv5.
  */
@@ -97,17 +109,22 @@ static inline void prefetch(const void *ptr)
 {
 	__asm__ __volatile__(
 		"pld\t%a0"
-		:
-		: "p" (ptr)
-		: "cc");
+		:: "p" (ptr));
 }
 
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
 #define ARCH_HAS_PREFETCHW
-#define prefetchw(ptr)	prefetch(ptr)
-
-#define ARCH_HAS_SPINLOCK_PREFETCH
-#define spin_lock_prefetch(x) do { } while (0)
-
+static inline void prefetchw(const void *ptr)
+{
+	__asm__ __volatile__(
+		".arch_extension	mp\n"
+		__ALT_SMP_ASM(
+			WASM(pldw)		"\t%a0",
+			WASM(pld)		"\t%a0"
+		)
+		:: "p" (ptr));
+}
+#endif
 #endif
 
 #define HAVE_ARCH_PICK_MMAP_LAYOUT

+ 1 - 1
arch/arm/include/asm/setup.h

@@ -49,7 +49,7 @@ extern struct meminfo meminfo;
 #define bank_phys_end(bank)	((bank)->start + (bank)->size)
 #define bank_phys_size(bank)	(bank)->size
 
-extern int arm_add_memory(phys_addr_t start, phys_addr_t size);
+extern int arm_add_memory(u64 start, u64 size);
 extern void early_print(const char *str, ...);
 extern void dump_machine_table(void);
 

+ 2 - 0
arch/arm/include/asm/smp.h

@@ -84,6 +84,8 @@ extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 extern void arch_send_wakeup_ipi_mask(const struct cpumask *mask);
 
+extern int register_ipi_completion(struct completion *completion, int cpu);
+
 struct smp_operations {
 #ifdef CONFIG_SMP
 	/*

+ 20 - 16
arch/arm/include/asm/spinlock.h

@@ -5,21 +5,13 @@
 #error SMP not supported on pre-ARMv6 CPUs
 #endif
 
-#include <asm/processor.h>
+#include <linux/prefetch.h>
 
 /*
  * sev and wfe are ARMv6K extensions.  Uniprocessor ARMv6 may not have the K
  * extensions, so when running on UP, we have to patch these instructions away.
  */
-#define ALT_SMP(smp, up)					\
-	"9998:	" smp "\n"					\
-	"	.pushsection \".alt.smp.init\", \"a\"\n"	\
-	"	.long	9998b\n"				\
-	"	" up "\n"					\
-	"	.popsection\n"
-
 #ifdef CONFIG_THUMB2_KERNEL
-#define SEV		ALT_SMP("sev.w", "nop.w")
 /*
  * For Thumb-2, special care is needed to ensure that the conditional WFE
  * instruction really does assemble to exactly 4 bytes (as required by
@@ -31,17 +23,18 @@
  * the assembler won't change IT instructions which are explicitly present
  * in the input.
  */
-#define WFE(cond)	ALT_SMP(		\
+#define WFE(cond)	__ALT_SMP_ASM(		\
 	"it " cond "\n\t"			\
 	"wfe" cond ".n",			\
 						\
 	"nop.w"					\
 )
 #else
-#define SEV		ALT_SMP("sev", "nop")
-#define WFE(cond)	ALT_SMP("wfe" cond, "nop")
+#define WFE(cond)	__ALT_SMP_ASM("wfe" cond, "nop")
 #endif
 
+#define SEV		__ALT_SMP_ASM(WASM(sev), WASM(nop))
+
 static inline void dsb_sev(void)
 {
 #if __LINUX_ARM_ARCH__ >= 7
@@ -77,6 +70,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 	u32 newval;
 	arch_spinlock_t lockval;
 
+	prefetchw(&lock->slock);
 	__asm__ __volatile__(
 "1:	ldrex	%0, [%3]\n"
 "	add	%1, %0, %4\n"
@@ -100,6 +94,7 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
 	unsigned long contended, res;
 	u32 slock;
 
+	prefetchw(&lock->slock);
 	do {
 		__asm__ __volatile__(
 		"	ldrex	%0, [%3]\n"
@@ -127,10 +122,14 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	dsb_sev();
 }
 
+static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+	return lock.tickets.owner == lock.tickets.next;
+}
+
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-	struct __raw_tickets tickets = ACCESS_ONCE(lock->tickets);
-	return tickets.owner != tickets.next;
+	return !arch_spin_value_unlocked(ACCESS_ONCE(*lock));
 }
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
@@ -152,6 +151,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
 {
 	unsigned long tmp;
 
+	prefetchw(&rw->lock);
 	__asm__ __volatile__(
 "1:	ldrex	%0, [%1]\n"
 "	teq	%0, #0\n"
@@ -170,6 +170,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
 {
 	unsigned long contended, res;
 
+	prefetchw(&rw->lock);
 	do {
 		__asm__ __volatile__(
 		"	ldrex	%0, [%2]\n"
@@ -203,7 +204,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 }
 
 /* write_can_lock - would write_trylock() succeed? */
-#define arch_write_can_lock(x)		((x)->lock == 0)
+#define arch_write_can_lock(x)		(ACCESS_ONCE((x)->lock) == 0)
 
 /*
  * Read locks are a bit more hairy:
@@ -221,6 +222,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
 {
 	unsigned long tmp, tmp2;
 
+	prefetchw(&rw->lock);
 	__asm__ __volatile__(
 "1:	ldrex	%0, [%2]\n"
 "	adds	%0, %0, #1\n"
@@ -241,6 +243,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
 
 	smp_mb();
 
+	prefetchw(&rw->lock);
 	__asm__ __volatile__(
 "1:	ldrex	%0, [%2]\n"
 "	sub	%0, %0, #1\n"
@@ -259,6 +262,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 {
 	unsigned long contended, res;
 
+	prefetchw(&rw->lock);
 	do {
 		__asm__ __volatile__(
 		"	ldrex	%0, [%2]\n"
@@ -280,7 +284,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 }
 
 /* read_can_lock - would read_trylock() succeed? */
-#define arch_read_can_lock(x)		((x)->lock < 0x80000000)
+#define arch_read_can_lock(x)		(ACCESS_ONCE((x)->lock) < 0x80000000)
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)

+ 1 - 1
arch/arm/include/asm/spinlock_types.h

@@ -25,7 +25,7 @@ typedef struct {
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
 
 typedef struct {
-	volatile unsigned int lock;
+	u32 lock;
 } arch_rwlock_t;
 
 #define __ARCH_RW_LOCK_UNLOCKED		{ 0 }

+ 17 - 31
arch/arm/include/asm/tlbflush.h

@@ -560,37 +560,6 @@ static inline void __flush_bp_all(void)
 		asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero));
 }
 
-#include <asm/cputype.h>
-#ifdef CONFIG_ARM_ERRATA_798181
-static inline int erratum_a15_798181(void)
-{
-	unsigned int midr = read_cpuid_id();
-
-	/* Cortex-A15 r0p0..r3p2 affected */
-	if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2)
-		return 0;
-	return 1;
-}
-
-static inline void dummy_flush_tlb_a15_erratum(void)
-{
-	/*
-	 * Dummy TLBIMVAIS. Using the unmapped address 0 and ASID 0.
-	 */
-	asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
-	dsb(ish);
-}
-#else
-static inline int erratum_a15_798181(void)
-{
-	return 0;
-}
-
-static inline void dummy_flush_tlb_a15_erratum(void)
-{
-}
-#endif
-
 /*
  *	flush_pmd_entry
  *
@@ -697,4 +666,21 @@ extern void flush_bp_all(void);
 
 #endif
 
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_ARM_ERRATA_798181
+extern void erratum_a15_798181_init(void);
+#else
+static inline void erratum_a15_798181_init(void) {}
+#endif
+extern bool (*erratum_a15_798181_handler)(void);
+
+static inline bool erratum_a15_798181(void)
+{
+	if (unlikely(IS_ENABLED(CONFIG_ARM_ERRATA_798181) &&
+		erratum_a15_798181_handler))
+		return erratum_a15_798181_handler();
+	return false;
+}
+#endif
+
 #endif

+ 4 - 0
arch/arm/include/asm/unified.h

@@ -38,6 +38,8 @@
 #ifdef __ASSEMBLY__
 #define W(instr)	instr.w
 #define BSYM(sym)	sym + 1
+#else
+#define WASM(instr)	#instr ".w"
 #endif
 
 #else	/* !CONFIG_THUMB2_KERNEL */
@@ -50,6 +52,8 @@
 #ifdef __ASSEMBLY__
 #define W(instr)	instr
 #define BSYM(sym)	sym
+#else
+#define WASM(instr)	#instr
 #endif
 
 #endif	/* CONFIG_THUMB2_KERNEL */

+ 45 - 0
arch/arm/include/debug/efm32.S

@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2013 Pengutronix
+ * Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define UARTn_CMD		0x000c
+#define UARTn_CMD_TXEN			0x0004
+
+#define	UARTn_STATUS		0x0010
+#define	UARTn_STATUS_TXC		0x0020
+#define	UARTn_STATUS_TXBL		0x0040
+
+#define	UARTn_TXDATA		0x0034
+
+		.macro	addruart, rx, tmp
+		ldr	\rx, =(CONFIG_DEBUG_UART_PHYS)
+
+		/*
+		 * Enable TX. The driver might disable it to save energy. We
+		 * don't bother disabling it again at the end, as power
+		 * consumption isn't that important during debugging.
+		 */
+		ldr	\tmp, =(UARTn_CMD_TXEN)
+		str	\tmp, [\rx, #UARTn_CMD]
+		.endm
+
+		.macro	senduart,rd,rx
+		strb	\rd, [\rx, #UARTn_TXDATA]
+		.endm
+
+		.macro	waituart,rd,rx
+1001:		ldr	\rd, [\rx, #UARTn_STATUS]
+		tst	\rd, #UARTn_STATUS_TXBL
+		beq	1001b
+		.endm
+
+		.macro	busyuart,rd,rx
+1001:		ldr	\rd, [\rx, UARTn_STATUS]
+		tst	\rd, #UARTn_STATUS_TXC
+		bne	1001b
+		.endm

+ 5 - 0
arch/arm/include/debug/msm.S

@@ -44,6 +44,11 @@
 #ifdef CONFIG_DEBUG_MSM8960_UART
 #define MSM_DEBUG_UART_BASE	0xF0040000
 #define MSM_DEBUG_UART_PHYS	0x16440000
+#endif
+
+#ifdef CONFIG_DEBUG_MSM8974_UART
+#define MSM_DEBUG_UART_BASE	0xFA71E000
+#define MSM_DEBUG_UART_PHYS	0xF991E000
 #endif
 
 	.macro	addruart, rp, rv, tmp

+ 2 - 0
arch/arm/include/debug/pl01x.S

@@ -25,12 +25,14 @@
 
 		.macro	waituart,rd,rx
 1001:		ldr	\rd, [\rx, #UART01x_FR]
+ ARM_BE8(	rev	\rd, \rd )
 		tst	\rd, #UART01x_FR_TXFF
 		bne	1001b
 		.endm
 
 		.macro	busyuart,rd,rx
 1001:		ldr	\rd, [\rx, #UART01x_FR]
+ ARM_BE8(	rev	\rd, \rd )
 		tst	\rd, #UART01x_FR_BUSY
 		bne	1001b
 		.endm

+ 1 - 0
arch/arm/include/uapi/asm/Kbuild

@@ -7,6 +7,7 @@ header-y += hwcap.h
 header-y += ioctls.h
 header-y += kvm_para.h
 header-y += mman.h
+header-y += perf_regs.h
 header-y += posix_types.h
 header-y += ptrace.h
 header-y += setup.h

+ 23 - 0
arch/arm/include/uapi/asm/perf_regs.h

@@ -0,0 +1,23 @@
+#ifndef _ASM_ARM_PERF_REGS_H
+#define _ASM_ARM_PERF_REGS_H
+
+enum perf_event_arm_regs {
+	PERF_REG_ARM_R0,
+	PERF_REG_ARM_R1,
+	PERF_REG_ARM_R2,
+	PERF_REG_ARM_R3,
+	PERF_REG_ARM_R4,
+	PERF_REG_ARM_R5,
+	PERF_REG_ARM_R6,
+	PERF_REG_ARM_R7,
+	PERF_REG_ARM_R8,
+	PERF_REG_ARM_R9,
+	PERF_REG_ARM_R10,
+	PERF_REG_ARM_FP,
+	PERF_REG_ARM_IP,
+	PERF_REG_ARM_SP,
+	PERF_REG_ARM_LR,
+	PERF_REG_ARM_PC,
+	PERF_REG_ARM_MAX,
+};
+#endif /* _ASM_ARM_PERF_REGS_H */

+ 3 - 1
arch/arm/kernel/Makefile

@@ -17,7 +17,8 @@ CFLAGS_REMOVE_return_address.o = -pg
 
 obj-y		:= elf.o entry-common.o irq.o opcodes.o \
 		   process.o ptrace.o return_address.o \
-		   setup.o signal.o stacktrace.o sys_arm.o time.o traps.o
+		   setup.o signal.o sigreturn_codes.o \
+		   stacktrace.o sys_arm.o time.o traps.o
 
 obj-$(CONFIG_ATAGS)		+= atags_parse.o
 obj-$(CONFIG_ATAGS_PROC)	+= atags_proc.o
@@ -78,6 +79,7 @@ obj-$(CONFIG_CPU_XSC3)		+= xscale-cp0.o
 obj-$(CONFIG_CPU_MOHAWK)	+= xscale-cp0.o
 obj-$(CONFIG_CPU_PJ4)		+= pj4-cp0.o
 obj-$(CONFIG_IWMMXT)		+= iwmmxt.o
+obj-$(CONFIG_PERF_EVENTS)	+= perf_regs.o
 obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o perf_event_cpu.o
 AFLAGS_iwmmxt.o			:= -Wa,-mcpu=iwmmxt
 obj-$(CONFIG_ARM_CPU_TOPOLOGY)  += topology.o

+ 1 - 0
arch/arm/kernel/armksyms.c

@@ -155,4 +155,5 @@ EXPORT_SYMBOL(__gnu_mcount_nc);
 
 #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
 EXPORT_SYMBOL(__pv_phys_offset);
+EXPORT_SYMBOL(__pv_offset);
 #endif

+ 3 - 3
arch/arm/kernel/entry-armv.S

@@ -192,6 +192,7 @@ __dabt_svc:
 	svc_entry
 	mov	r2, sp
 	dabt_helper
+ THUMB(	ldr	r5, [sp, #S_PSR]	)	@ potentially updated CPSR
 	svc_exit r5				@ return from exception
  UNWIND(.fnend		)
 ENDPROC(__dabt_svc)
@@ -416,9 +417,8 @@ __und_usr:
 	bne	__und_usr_thumb
 	sub	r4, r2, #4			@ ARM instr at LR - 4
 1:	ldrt	r0, [r4]
-#ifdef CONFIG_CPU_ENDIAN_BE8
-	rev	r0, r0				@ little endian instruction
-#endif
+ ARM_BE8(rev	r0, r0)				@ little endian instruction
+
 	@ r0 = 32-bit ARM instruction which caused the exception
 	@ r2 = PC value for the following instruction (:= regs->ARM_pc)
 	@ r4 = PC value for the faulting instruction

+ 1 - 3
arch/arm/kernel/entry-common.S

@@ -393,9 +393,7 @@ ENTRY(vector_swi)
 #else
  USER(	ldr	r10, [lr, #-4]		)	@ get SWI instruction
 #endif
-#ifdef CONFIG_CPU_ENDIAN_BE8
-	rev	r10, r10			@ little endian instruction
-#endif
+ ARM_BE8(rev	r10, r10)			@ little endian instruction
 
 #elif defined(CONFIG_AEABI)
 

+ 66 - 16
arch/arm/kernel/head.S

@@ -77,6 +77,7 @@
 
 	__HEAD
 ENTRY(stext)
+ ARM_BE8(setend	be )			@ ensure we are in BE8 mode
 
  THUMB(	adr	r9, BSYM(1f)	)	@ Kernel is always entered in ARM.
  THUMB(	bx	r9		)	@ If this is a Thumb-2 kernel,
@@ -352,6 +353,9 @@ ENTRY(secondary_startup)
 	 * the processor type - there is no need to check the machine type
 	 * as it has already been validated by the primary processor.
 	 */
+
+ ARM_BE8(setend	be)				@ ensure we are in BE8 mode
+
 #ifdef CONFIG_ARM_VIRT_EXT
 	bl	__hyp_stub_install_secondary
 #endif
@@ -555,6 +559,14 @@ ENTRY(fixup_smp)
 	ldmfd	sp!, {r4 - r6, pc}
 ENDPROC(fixup_smp)
 
+#ifdef __ARMEB__
+#define LOW_OFFSET	0x4
+#define HIGH_OFFSET	0x0
+#else
+#define LOW_OFFSET	0x0
+#define HIGH_OFFSET	0x4
+#endif
+
 #ifdef CONFIG_ARM_PATCH_PHYS_VIRT
 
 /* __fixup_pv_table - patch the stub instructions with the delta between
@@ -565,17 +577,20 @@ ENDPROC(fixup_smp)
 	__HEAD
 __fixup_pv_table:
 	adr	r0, 1f
-	ldmia	r0, {r3-r5, r7}
-	sub	r3, r0, r3	@ PHYS_OFFSET - PAGE_OFFSET
+	ldmia	r0, {r3-r7}
+	mvn	ip, #0
+	subs	r3, r0, r3	@ PHYS_OFFSET - PAGE_OFFSET
 	add	r4, r4, r3	@ adjust table start address
 	add	r5, r5, r3	@ adjust table end address
-	add	r7, r7, r3	@ adjust __pv_phys_offset address
-	str	r8, [r7]	@ save computed PHYS_OFFSET to __pv_phys_offset
+	add	r6, r6, r3	@ adjust __pv_phys_offset address
+	add	r7, r7, r3	@ adjust __pv_offset address
+	str	r8, [r6, #LOW_OFFSET]	@ save computed PHYS_OFFSET to __pv_phys_offset
+	strcc	ip, [r7, #HIGH_OFFSET]	@ save to __pv_offset high bits
 	mov	r6, r3, lsr #24	@ constant for add/sub instructions
 	teq	r3, r6, lsl #24 @ must be 16MiB aligned
 THUMB(	it	ne		@ cross section branch )
 	bne	__error
-	str	r6, [r7, #4]	@ save to __pv_offset
+	str	r3, [r7, #LOW_OFFSET]	@ save to __pv_offset low bits
 	b	__fixup_a_pv_table
 ENDPROC(__fixup_pv_table)
 
@@ -584,10 +599,19 @@ ENDPROC(__fixup_pv_table)
 	.long	__pv_table_begin
 	.long	__pv_table_end
 2:	.long	__pv_phys_offset
+	.long	__pv_offset
 
 	.text
 __fixup_a_pv_table:
+	adr	r0, 3f
+	ldr	r6, [r0]
+	add	r6, r6, r3
+	ldr	r0, [r6, #HIGH_OFFSET]	@ pv_offset high word
+	ldr	r6, [r6, #LOW_OFFSET]	@ pv_offset low word
+	mov	r6, r6, lsr #24
+	cmn	r0, #1
 #ifdef CONFIG_THUMB2_KERNEL
+	moveq	r0, #0x200000	@ set bit 21, mov to mvn instruction
 	lsls	r6, #24
 	beq	2f
 	clz	r7, r6
@@ -601,18 +625,42 @@ __fixup_a_pv_table:
 	b	2f
 1:	add     r7, r3
 	ldrh	ip, [r7, #2]
-	and	ip, 0x8f00
-	orr	ip, r6	@ mask in offset bits 31-24
+ARM_BE8(rev16	ip, ip)
+	tst	ip, #0x4000
+	and	ip, #0x8f00
+	orrne	ip, r6	@ mask in offset bits 31-24
+	orreq	ip, r0	@ mask in offset bits 7-0
+ARM_BE8(rev16	ip, ip)
 	strh	ip, [r7, #2]
+	bne	2f
+	ldrh	ip, [r7]
+ARM_BE8(rev16	ip, ip)
+	bic	ip, #0x20
+	orr	ip, ip, r0, lsr #16
+ARM_BE8(rev16	ip, ip)
+	strh	ip, [r7]
 2:	cmp	r4, r5
 	ldrcc	r7, [r4], #4	@ use branch for delay slot
 	bcc	1b
 	bx	lr
 #else
+	moveq	r0, #0x400000	@ set bit 22, mov to mvn instruction
 	b	2f
 1:	ldr	ip, [r7, r3]
+#ifdef CONFIG_CPU_ENDIAN_BE8
+	@ in BE8, we load data in BE, but instructions still in LE
+	bic	ip, ip, #0xff000000
+	tst	ip, #0x000f0000	@ check the rotation field
+	orrne	ip, ip, r6, lsl #24 @ mask in offset bits 31-24
+	biceq	ip, ip, #0x00004000 @ clear bit 22
+	orreq	ip, ip, r0, lsl #24 @ mask in offset bits 7-0
+#else
 	bic	ip, ip, #0x000000ff
-	orr	ip, ip, r6	@ mask in offset bits 31-24
+	tst	ip, #0xf00	@ check the rotation field
+	orrne	ip, ip, r6	@ mask in offset bits 31-24
+	biceq	ip, ip, #0x400000	@ clear bit 22
+	orreq	ip, ip, r0	@ mask in offset bits 7-0
+#endif
 	str	ip, [r7, r3]
 2:	cmp	r4, r5
 	ldrcc	r7, [r4], #4	@ use branch for delay slot
@@ -621,28 +669,30 @@ __fixup_a_pv_table:
 #endif
 ENDPROC(__fixup_a_pv_table)
 
+	.align
+3:	.long __pv_offset
+
 ENTRY(fixup_pv_table)
 	stmfd	sp!, {r4 - r7, lr}
-	ldr	r2, 2f			@ get address of __pv_phys_offset
 	mov	r3, #0			@ no offset
 	mov	r4, r0			@ r0 = table start
 	add	r5, r0, r1		@ r1 = table size
-	ldr	r6, [r2, #4]		@ get __pv_offset
 	bl	__fixup_a_pv_table
 	ldmfd	sp!, {r4 - r7, pc}
 ENDPROC(fixup_pv_table)
 
-	.align
-2:	.long	__pv_phys_offset
-
 	.data
 	.globl	__pv_phys_offset
 	.type	__pv_phys_offset, %object
 __pv_phys_offset:
-	.long	0
-	.size	__pv_phys_offset, . - __pv_phys_offset
+	.quad	0
+	.size	__pv_phys_offset, . -__pv_phys_offset
+
+	.globl	__pv_offset
+	.type	__pv_offset, %object
 __pv_offset:
-	.long	0
+	.quad	0
+	.size	__pv_offset, . -__pv_offset
 #endif
 
 #include "head-common.S"

+ 7 - 7
arch/arm/kernel/hw_breakpoint.c

@@ -344,13 +344,13 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
 		/* Breakpoint */
 		ctrl_base = ARM_BASE_BCR;
 		val_base = ARM_BASE_BVR;
-		slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+		slots = this_cpu_ptr(bp_on_reg);
 		max_slots = core_num_brps;
 	} else {
 		/* Watchpoint */
 		ctrl_base = ARM_BASE_WCR;
 		val_base = ARM_BASE_WVR;
-		slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+		slots = this_cpu_ptr(wp_on_reg);
 		max_slots = core_num_wrps;
 	}
 
@@ -396,12 +396,12 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
 	if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) {
 		/* Breakpoint */
 		base = ARM_BASE_BCR;
-		slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+		slots = this_cpu_ptr(bp_on_reg);
 		max_slots = core_num_brps;
 	} else {
 		/* Watchpoint */
 		base = ARM_BASE_WCR;
-		slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+		slots = this_cpu_ptr(wp_on_reg);
 		max_slots = core_num_wrps;
 	}
 
@@ -697,7 +697,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
 	struct arch_hw_breakpoint *info;
 	struct arch_hw_breakpoint_ctrl ctrl;
 
-	slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+	slots = this_cpu_ptr(wp_on_reg);
 
 	for (i = 0; i < core_num_wrps; ++i) {
 		rcu_read_lock();
@@ -768,7 +768,7 @@ static void watchpoint_single_step_handler(unsigned long pc)
 	struct perf_event *wp, **slots;
 	struct arch_hw_breakpoint *info;
 
-	slots = (struct perf_event **)__get_cpu_var(wp_on_reg);
+	slots = this_cpu_ptr(wp_on_reg);
 
 	for (i = 0; i < core_num_wrps; ++i) {
 		rcu_read_lock();
@@ -802,7 +802,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
 	struct arch_hw_breakpoint *info;
 	struct arch_hw_breakpoint_ctrl ctrl;
 
-	slots = (struct perf_event **)__get_cpu_var(bp_on_reg);
+	slots = this_cpu_ptr(bp_on_reg);
 
 	/* The exception entry code places the amended lr in the PC. */
 	addr = regs->ARM_pc;
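
The conversions above follow the generic per-CPU accessor cleanup: __get_cpu_var() (an lvalue that then had to be cast and address-taken) is replaced by this_cpu_ptr() for pointers into per-CPU arrays and __this_cpu_write() for stores. A small standalone sketch of the two idioms (hypothetical type and variable names, not from the patch):

	#include <linux/percpu.h>

	struct my_state;

	static DEFINE_PER_CPU(struct my_state *, current_state);
	static DEFINE_PER_CPU(struct my_state *, state_slots[4]);

	static void example(struct my_state *s)
	{
		struct my_state **slots;

		/* was: __get_cpu_var(current_state) = s; */
		__this_cpu_write(current_state, s);

		/* was: slots = (struct my_state **)__get_cpu_var(state_slots); */
		slots = this_cpu_ptr(state_slots);
		slots[0] = s;
	}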

+ 4 - 4
arch/arm/kernel/kprobes.c

@@ -171,13 +171,13 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
 
 static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 {
-	__get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
 	kcb->kprobe_status = kcb->prev_kprobe.status;
 }
 
 static void __kprobes set_current_kprobe(struct kprobe *p)
 {
-	__get_cpu_var(current_kprobe) = p;
+	__this_cpu_write(current_kprobe, p);
 }
 
 static void __kprobes
@@ -421,10 +421,10 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
 			continue;
 
 		if (ri->rp && ri->rp->handler) {
-			__get_cpu_var(current_kprobe) = &ri->rp->kp;
+			__this_cpu_write(current_kprobe, &ri->rp->kp);
 			get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
 			ri->rp->handler(ri, regs);
-			__get_cpu_var(current_kprobe) = NULL;
+			__this_cpu_write(current_kprobe, NULL);
 		}
 
 		orig_ret_address = (unsigned long)ri->ret_addr;

+ 34 - 23
arch/arm/kernel/module.c

@@ -24,6 +24,7 @@
 #include <asm/sections.h>
 #include <asm/smp_plat.h>
 #include <asm/unwind.h>
+#include <asm/opcodes.h>
 
 #ifdef CONFIG_XIP_KERNEL
 /*
@@ -60,6 +61,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 		Elf32_Sym *sym;
 		const char *symname;
 		s32 offset;
+		u32 tmp;
 #ifdef CONFIG_THUMB2_KERNEL
 		u32 upper, lower, sign, j1, j2;
 #endif
@@ -95,7 +97,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 		case R_ARM_PC24:
 		case R_ARM_CALL:
 		case R_ARM_JUMP24:
-			offset = (*(u32 *)loc & 0x00ffffff) << 2;
+			offset = __mem_to_opcode_arm(*(u32 *)loc);
+			offset = (offset & 0x00ffffff) << 2;
 			if (offset & 0x02000000)
 				offset -= 0x04000000;
 
@@ -111,9 +114,10 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 			}
 
 			offset >>= 2;
+			offset &= 0x00ffffff;
 
-			*(u32 *)loc &= 0xff000000;
-			*(u32 *)loc |= offset & 0x00ffffff;
+			*(u32 *)loc &= __opcode_to_mem_arm(0xff000000);
+			*(u32 *)loc |= __opcode_to_mem_arm(offset);
 			break;
 
 	       case R_ARM_V4BX:
@@ -121,8 +125,8 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 			* other bits to re-code instruction as
 			* MOV PC,Rm.
 			*/
-		       *(u32 *)loc &= 0xf000000f;
-		       *(u32 *)loc |= 0x01a0f000;
+		       *(u32 *)loc &= __opcode_to_mem_arm(0xf000000f);
+		       *(u32 *)loc |= __opcode_to_mem_arm(0x01a0f000);
 		       break;
 
 		case R_ARM_PREL31:
@@ -132,7 +136,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 
 		case R_ARM_MOVW_ABS_NC:
 		case R_ARM_MOVT_ABS:
-			offset = *(u32 *)loc;
+			offset = tmp = __mem_to_opcode_arm(*(u32 *)loc);
 			offset = ((offset & 0xf0000) >> 4) | (offset & 0xfff);
 			offset = (offset ^ 0x8000) - 0x8000;
 
@@ -140,16 +144,18 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 			if (ELF32_R_TYPE(rel->r_info) == R_ARM_MOVT_ABS)
 				offset >>= 16;
 
-			*(u32 *)loc &= 0xfff0f000;
-			*(u32 *)loc |= ((offset & 0xf000) << 4) |
-					(offset & 0x0fff);
+			tmp &= 0xfff0f000;
+			tmp |= ((offset & 0xf000) << 4) |
+				(offset & 0x0fff);
+
+			*(u32 *)loc = __opcode_to_mem_arm(tmp);
 			break;
 
 #ifdef CONFIG_THUMB2_KERNEL
 		case R_ARM_THM_CALL:
 		case R_ARM_THM_JUMP24:
-			upper = *(u16 *)loc;
-			lower = *(u16 *)(loc + 2);
+			upper = __mem_to_opcode_thumb16(*(u16 *)loc);
+			lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
 
 			/*
 			 * 25 bit signed address range (Thumb-2 BL and B.W
@@ -198,17 +204,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 			sign = (offset >> 24) & 1;
 			j1 = sign ^ (~(offset >> 23) & 1);
 			j2 = sign ^ (~(offset >> 22) & 1);
-			*(u16 *)loc = (u16)((upper & 0xf800) | (sign << 10) |
+			upper = (u16)((upper & 0xf800) | (sign << 10) |
 					    ((offset >> 12) & 0x03ff));
-			*(u16 *)(loc + 2) = (u16)((lower & 0xd000) |
-						  (j1 << 13) | (j2 << 11) |
-						  ((offset >> 1) & 0x07ff));
+			lower = (u16)((lower & 0xd000) |
+				      (j1 << 13) | (j2 << 11) |
+				      ((offset >> 1) & 0x07ff));
+
+			*(u16 *)loc = __opcode_to_mem_thumb16(upper);
+			*(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
 			break;
 
 		case R_ARM_THM_MOVW_ABS_NC:
 		case R_ARM_THM_MOVT_ABS:
-			upper = *(u16 *)loc;
-			lower = *(u16 *)(loc + 2);
+			upper = __mem_to_opcode_thumb16(*(u16 *)loc);
+			lower = __mem_to_opcode_thumb16(*(u16 *)(loc + 2));
 
 			/*
 			 * MOVT/MOVW instructions encoding in Thumb-2:
@@ -229,12 +238,14 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 			if (ELF32_R_TYPE(rel->r_info) == R_ARM_THM_MOVT_ABS)
 				offset >>= 16;
 
-			*(u16 *)loc = (u16)((upper & 0xfbf0) |
-					    ((offset & 0xf000) >> 12) |
-					    ((offset & 0x0800) >> 1));
-			*(u16 *)(loc + 2) = (u16)((lower & 0x8f00) |
-						  ((offset & 0x0700) << 4) |
-						  (offset & 0x00ff));
+			upper = (u16)((upper & 0xfbf0) |
+				      ((offset & 0xf000) >> 12) |
+				      ((offset & 0x0800) >> 1));
+			lower = (u16)((lower & 0x8f00) |
+				      ((offset & 0x0700) << 4) |
+				      (offset & 0x00ff));
+			*(u16 *)loc = __opcode_to_mem_thumb16(upper);
+			*(u16 *)(loc + 2) = __opcode_to_mem_thumb16(lower);
 			break;
 #endif
 

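The module relocation hunks above route every instruction read/write through __mem_to_opcode_arm()/__opcode_to_mem_arm() (and the thumb16 variants), which are byte swaps on a BE8 kernel — where instructions stay little-endian while data accesses are big-endian — and no-ops otherwise; the 24-bit branch arithmetic itself is unchanged apart from masking the offset before insertion. A standalone sketch of that arithmetic (plain C, not kernel code; the byte swap models the BE8 case):

#include <stdint.h>
#include <stdio.h>

static uint32_t mem_to_opcode_arm(uint32_t x, int be8)
{
	/* identity on an LE build, byte swap on a BE8 build; its own inverse */
	return be8 ? __builtin_bswap32(x) : x;
}

static uint32_t fixup_branch(uint32_t insn_in_mem, int32_t sym_addr,
			     int32_t loc, int be8)
{
	uint32_t insn = mem_to_opcode_arm(insn_in_mem, be8);
	int32_t offset = (insn & 0x00ffffff) << 2;	/* 24-bit word offset */

	if (offset & 0x02000000)			/* sign-extend */
		offset -= 0x04000000;
	offset += sym_addr - loc;			/* relocate */
	offset >>= 2;
	offset &= 0x00ffffff;

	insn = (insn & 0xff000000) | offset;
	return mem_to_opcode_arm(insn, be8);		/* back to memory order */
}

int main(void)
{
	/* BL with a zero offset, relocated to target 0x100 bytes ahead of loc */
	printf("%08x\n", fixup_branch(0xeb000000, 0x8100, 0x8000, 0));
	return 0;
}
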
+ 1 - 2
arch/arm/kernel/perf_event.c

@@ -256,12 +256,11 @@ validate_event(struct pmu_hw_events *hw_events,
 	       struct perf_event *event)
 {
 	struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
-	struct pmu *leader_pmu = event->group_leader->pmu;
 
 	if (is_software_event(event))
 		return 1;
 
-	if (event->pmu != leader_pmu || event->state < PERF_EVENT_STATE_OFF)
+	if (event->state < PERF_EVENT_STATE_OFF)
 		return 1;
 
 	if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec)

+ 1 - 1
arch/arm/kernel/perf_event_cpu.c

@@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(perf_num_counters);
 
 static struct pmu_hw_events *cpu_pmu_get_cpu_events(void)
 {
-	return &__get_cpu_var(cpu_hw_events);
+	return this_cpu_ptr(&cpu_hw_events);
 }
 
 static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)

+ 30 - 0
arch/arm/kernel/perf_regs.c

@@ -0,0 +1,30 @@
+
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+	if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM_MAX))
+		return 0;
+
+	return regs->uregs[idx];
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))
+
+int perf_reg_validate(u64 mask)
+{
+	if (!mask || mask & REG_RESERVED)
+		return -EINVAL;
+
+	return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+	return PERF_SAMPLE_REGS_ABI_32;
+}

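perf_regs.c provides the arch hooks for perf's user register sampling: perf_reg_value() returns a saved register by index and perf_reg_validate() rejects a sample mask that is empty or selects registers the architecture does not expose. A standalone version of the mask check, with PERF_REG_ARM_MAX assumed to be 16 (r0-r15):

#include <stdint.h>
#include <stdio.h>

#define PERF_REG_ARM_MAX 16
#define REG_RESERVED (~((1ULL << PERF_REG_ARM_MAX) - 1))

static int reg_validate(uint64_t mask)
{
	if (!mask || (mask & REG_RESERVED))
		return -1;			/* -EINVAL in the kernel */
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       reg_validate(0),			/* empty mask: rejected     */
	       reg_validate(1ULL << 15),	/* pc (r15): accepted       */
	       reg_validate(1ULL << 16));	/* reserved bit: rejected   */
	return 0;
}
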
+ 21 - 7
arch/arm/kernel/setup.c

@@ -73,6 +73,8 @@ __setup("fpe=", fpe_setup);
 #endif
 
 extern void paging_init(const struct machine_desc *desc);
+extern void early_paging_init(const struct machine_desc *,
+			      struct proc_info_list *);
 extern void sanity_check_meminfo(void);
 extern enum reboot_mode reboot_mode;
 extern void setup_dma_zone(const struct machine_desc *desc);
@@ -599,6 +601,8 @@ static void __init setup_processor(void)
 	elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT);
 #endif
 
+	erratum_a15_798181_init();
+
 	feat_v6_fixup();
 
 	cacheid_init();
@@ -619,9 +623,10 @@ void __init dump_machine_table(void)
 		/* can't use cpu_relax() here as it may require MMU setup */;
 }
 
-int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
+int __init arm_add_memory(u64 start, u64 size)
 {
 	struct membank *bank = &meminfo.bank[meminfo.nr_banks];
+	u64 aligned_start;
 
 	if (meminfo.nr_banks >= NR_BANKS) {
 		printk(KERN_CRIT "NR_BANKS too low, "
@@ -634,10 +639,16 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
 	 * Size is appropriately rounded down, start is rounded up.
 	 */
 	size -= start & ~PAGE_MASK;
-	bank->start = PAGE_ALIGN(start);
+	aligned_start = PAGE_ALIGN(start);
 
-#ifndef CONFIG_ARM_LPAE
-	if (bank->start + size < bank->start) {
+#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
+	if (aligned_start > ULONG_MAX) {
+		printk(KERN_CRIT "Ignoring memory at 0x%08llx outside "
+		       "32-bit physical address space\n", (long long)start);
+		return -EINVAL;
+	}
+
+	if (aligned_start + size > ULONG_MAX) {
 		printk(KERN_CRIT "Truncating memory at 0x%08llx to fit in "
 			"32-bit physical address space\n", (long long)start);
 		/*
@@ -645,10 +656,11 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
 		 * 32 bits, we use ULONG_MAX as the upper limit rather than 4GB.
 		 * This means we lose a page after masking.
 		 */
-		size = ULONG_MAX - bank->start;
+		size = ULONG_MAX - aligned_start;
 	}
 #endif
 
+	bank->start = aligned_start;
 	bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1);
 
 	/*
@@ -669,8 +681,8 @@ int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
 static int __init early_mem(char *p)
 {
 	static int usermem __initdata = 0;
-	phys_addr_t size;
-	phys_addr_t start;
+	u64 size;
+	u64 start;
 	char *endp;
 
 	/*
@@ -878,6 +890,8 @@ void __init setup_arch(char **cmdline_p)
 	parse_early_param();
 
 	sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL);
+
+	early_paging_init(mdesc, lookup_processor_type(read_cpuid_id()));
 	sanity_check_meminfo();
 	arm_memblock_init(&meminfo, mdesc);
 

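With arm_add_memory() taking u64 arguments, memory described by firmware above 4GB is at least parsed correctly; on a build with a 32-bit phys_addr_t the bank is then dropped or truncated to fit. A standalone model of the rounding and clamping (plain C; the 0xffffffff limit stands in for ULONG_MAX on such a build):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

static int add_memory(uint64_t start, uint64_t size)
{
	uint64_t aligned_start;

	size -= start & ~PAGE_MASK;		/* trim the partial leading page */
	aligned_start = PAGE_ALIGN(start);

	if (aligned_start > 0xffffffffULL)	/* whole bank above 4GB: drop it */
		return -1;
	if (aligned_start + size > 0xffffffffULL)
		size = 0xffffffffULL - aligned_start;	/* truncate to fit */

	size &= PAGE_MASK;			/* round size down to whole pages */
	printf("bank: start=0x%llx size=0x%llx\n",
	       (unsigned long long)aligned_start, (unsigned long long)size);
	return 0;
}

int main(void)
{
	add_memory(0x80000800, 0xC0000000);	/* unaligned start, crosses 4GB */
	return 0;
}
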
+ 11 - 27
arch/arm/kernel/signal.c

@@ -21,29 +21,7 @@
 #include <asm/unistd.h>
 #include <asm/vfp.h>
 
-/*
- * For ARM syscalls, we encode the syscall number into the instruction.
- */
-#define SWI_SYS_SIGRETURN	(0xef000000|(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE))
-#define SWI_SYS_RT_SIGRETURN	(0xef000000|(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE))
-
-/*
- * With EABI, the syscall number has to be loaded into r7.
- */
-#define MOV_R7_NR_SIGRETURN	(0xe3a07000 | (__NR_sigreturn - __NR_SYSCALL_BASE))
-#define MOV_R7_NR_RT_SIGRETURN	(0xe3a07000 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
-
-/*
- * For Thumb syscalls, we pass the syscall number via r7.  We therefore
- * need two 16-bit instructions.
- */
-#define SWI_THUMB_SIGRETURN	(0xdf00 << 16 | 0x2700 | (__NR_sigreturn - __NR_SYSCALL_BASE))
-#define SWI_THUMB_RT_SIGRETURN	(0xdf00 << 16 | 0x2700 | (__NR_rt_sigreturn - __NR_SYSCALL_BASE))
-
-static const unsigned long sigreturn_codes[7] = {
-	MOV_R7_NR_SIGRETURN,    SWI_SYS_SIGRETURN,    SWI_THUMB_SIGRETURN,
-	MOV_R7_NR_RT_SIGRETURN, SWI_SYS_RT_SIGRETURN, SWI_THUMB_RT_SIGRETURN,
-};
+extern const unsigned long sigreturn_codes[7];
 
 static unsigned long signal_return_offset;
 
@@ -375,12 +353,18 @@ setup_return(struct pt_regs *regs, struct ksignal *ksig,
 		 */
 		thumb = handler & 1;
 
-		if (thumb) {
-			cpsr |= PSR_T_BIT;
 #if __LINUX_ARM_ARCH__ >= 7
-			/* clear the If-Then Thumb-2 execution state */
-			cpsr &= ~PSR_IT_MASK;
+		/*
+		 * Clear the If-Then Thumb-2 execution state
+		 * ARM spec requires this to be all 000s in ARM mode
+		 * Snapdragon S4/Krait misbehaves on a Thumb=>ARM
+		 * signal transition without this.
+		 */
+		cpsr &= ~PSR_IT_MASK;
 #endif
+
+		if (thumb) {
+			cpsr |= PSR_T_BIT;
 		} else
 			cpsr &= ~PSR_T_BIT;
 	}

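The setup_return() change above clears the Thumb IT state for both ARM and Thumb handler targets, since the architecture requires the IT bits to be zero in ARM state and, per the comment, Krait misbehaves on a Thumb-to-ARM signal transition otherwise. A standalone sketch of the CPSR fixup, assuming the usual kernel values PSR_IT_MASK = 0x0600fc00 and PSR_T_BIT = 0x20:

#include <stdint.h>
#include <stdio.h>

#define PSR_IT_MASK	0x0600fc00u	/* IT[1:0] in bits 26:25, IT[7:2] in 15:10 */
#define PSR_T_BIT	0x00000020u

static uint32_t setup_return_cpsr(uint32_t cpsr, uint32_t handler)
{
	cpsr &= ~PSR_IT_MASK;		/* never enter a handler mid-IT-block */
	if (handler & 1)
		cpsr |= PSR_T_BIT;	/* odd handler address: Thumb entry */
	else
		cpsr &= ~PSR_T_BIT;	/* ARM entry */
	return cpsr;
}

int main(void)
{
	/* user mode, Thumb, pending IT state; delivering to an ARM handler */
	printf("0x%08x\n", setup_return_cpsr(0x20000430, 0x8000));
	return 0;
}
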
+ 80 - 0
arch/arm/kernel/sigreturn_codes.S

@@ -0,0 +1,80 @@
+/*
+ * sigreturn_codes.S - code snippets for sigreturn syscalls
+ *
+ * Created by:	Victor Kamensky, 2013-08-13
+ * Copyright:	(C) 2013  Linaro Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <asm/unistd.h>
+
+/*
+ * For ARM syscalls, we encode the syscall number into the instruction.
+ * With EABI, the syscall number has to be loaded into r7. As a result,
+ * the ARM syscall sequence snippet consists of a move and an svc in
+ * .arm encoding.
+ *
+ * For Thumb syscalls, we pass the syscall number via r7.  We therefore
+ * need two 16-bit instructions in .thumb encoding.
+ *
+ * Please note these sigreturn code snippets are not executed in place.
+ * Instead they are copied by the kernel into appropriate places. Code
+ * in arch/arm/kernel/signal.c is very sensitive to the layout of these
+ * snippets.
+ */
+
+#if __LINUX_ARM_ARCH__ <= 4
+	/*
+	 * Note we manually set minimally required arch that supports
+	 * required thumb opcodes for early arch versions. It is OK
+	 * for this file to be used in combination with other
+	 * lower arch variants, since these code snippets are only
+	 * used as input data.
+	 */
+	.arch armv4t
+#endif
+
+	.section .rodata
+	.global sigreturn_codes
+	.type	sigreturn_codes, #object
+
+	.arm
+
+sigreturn_codes:
+
+	/* ARM sigreturn syscall code snippet */
+	mov	r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
+	swi	#(__NR_sigreturn)|(__NR_OABI_SYSCALL_BASE)
+
+	/* Thumb sigreturn syscall code snippet */
+	.thumb
+	movs	r7, #(__NR_sigreturn - __NR_SYSCALL_BASE)
+	swi	#0
+
+	/* ARM sigreturn_rt syscall code snippet */
+	.arm
+	mov	r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
+	swi	#(__NR_rt_sigreturn)|(__NR_OABI_SYSCALL_BASE)
+
+	/* Thumb sigreturn_rt syscall code snippet */
+	.thumb
+	movs	r7, #(__NR_rt_sigreturn - __NR_SYSCALL_BASE)
+	swi	#0
+
+	/*
+	 * Note on additional space: the setup_return() algorithm in
+	 * signal.c copies two words regardless of whether it is the
+	 * Thumb case or not, so we need an additional word after the
+	 * real last entry.
+	 */
+	.arm
+	.space	4
+
+	.size	sigreturn_codes, . - sigreturn_codes

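These assembler snippets replace the hand-built constants removed from signal.c above, so that a BE8 kernel gets the instruction words in the byte order the CPU will fetch. As a rough cross-check, the two ARM-mode words can be computed directly; the syscall numbers and OABI base used below (sigreturn = 119, rt_sigreturn = 173, base = 0x900000) are the usual ARM values and are assumptions here, not taken from this diff:

#include <stdint.h>
#include <stdio.h>

#define OABI_BASE	0x900000u
#define NR_SIGRETURN	119u
#define NR_RT_SIGRETURN	173u

int main(void)
{
	uint32_t mov_r7_sig   = 0xe3a07000u | NR_SIGRETURN;	/* mov r7, #119 */
	uint32_t swi_sig      = 0xef000000u | OABI_BASE | NR_SIGRETURN;
	uint32_t mov_r7_rtsig = 0xe3a07000u | NR_RT_SIGRETURN;	/* mov r7, #173 */
	uint32_t swi_rtsig    = 0xef000000u | OABI_BASE | NR_RT_SIGRETURN;

	printf("sigreturn:    %08x %08x\n", mov_r7_sig, swi_sig);
	printf("rt_sigreturn: %08x %08x\n", mov_r7_rtsig, swi_rtsig);
	return 0;
}
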
+ 12 - 15
arch/arm/kernel/sleep.S

@@ -55,6 +55,7 @@
  * specific registers and some other data for resume.
  *  r0 = suspend function arg0
  *  r1 = suspend function
+ *  r2 = MPIDR value the resuming CPU will use
  */
 ENTRY(__cpu_suspend)
 	stmfd	sp!, {r4 - r11, lr}
@@ -67,23 +68,18 @@ ENTRY(__cpu_suspend)
 	mov	r5, sp			@ current virtual SP
 	add	r4, r4, #12		@ Space for pgd, virt sp, phys resume fn
 	sub	sp, sp, r4		@ allocate CPU state on stack
-	stmfd	sp!, {r0, r1}		@ save suspend func arg and pointer
-	add	r0, sp, #8		@ save pointer to save block
-	mov	r1, r4			@ size of save block
-	mov	r2, r5			@ virtual SP
 	ldr	r3, =sleep_save_sp
+	stmfd	sp!, {r0, r1}		@ save suspend func arg and pointer
 	ldr	r3, [r3, #SLEEP_SAVE_SP_VIRT]
-	ALT_SMP(mrc p15, 0, r9, c0, c0, 5)
-        ALT_UP_B(1f)
-	ldr	r8, =mpidr_hash
-	/*
-	 * This ldmia relies on the memory layout of the mpidr_hash
-	 * struct mpidr_hash.
-	 */
-	ldmia	r8, {r4-r7}	@ r4 = mpidr mask (r5,r6,r7) = l[0,1,2] shifts
-	compute_mpidr_hash	lr, r5, r6, r7, r9, r4
-	add	r3, r3, lr, lsl #2
-1:
+	ALT_SMP(ldr r0, =mpidr_hash)
+	ALT_UP_B(1f)
+	/* This ldmia relies on the memory layout of the mpidr_hash struct */
+	ldmia	r0, {r1, r6-r8}	@ r1 = mpidr mask (r6,r7,r8) = l[0,1,2] shifts
+	compute_mpidr_hash	r0, r6, r7, r8, r2, r1
+	add	r3, r3, r0, lsl #2
+1:	mov	r2, r5			@ virtual SP
+	mov	r1, r4			@ size of save block
+	add	r0, sp, #8		@ pointer to save block
 	bl	__cpu_suspend_save
 	adr	lr, BSYM(cpu_suspend_abort)
 	ldmfd	sp!, {r0, pc}		@ call suspend fn
@@ -130,6 +126,7 @@ ENDPROC(cpu_resume_after_mmu)
 	.data
 	.align
 ENTRY(cpu_resume)
+ARM_BE8(setend be)			@ ensure we are in BE mode
 	mov	r1, #0
 	ALT_SMP(mrc p15, 0, r0, c0, c0, 5)
 	ALT_UP_B(1f)

+ 41 - 1
arch/arm/kernel/smp.c

@@ -25,6 +25,7 @@
 #include <linux/clockchips.h>
 #include <linux/completion.h>
 #include <linux/cpufreq.h>
+#include <linux/irq_work.h>
 
 #include <linux/atomic.h>
 #include <asm/smp.h>
@@ -66,6 +67,8 @@ enum ipi_msg_type {
 	IPI_CALL_FUNC,
 	IPI_CALL_FUNC_SINGLE,
 	IPI_CPU_STOP,
+	IPI_IRQ_WORK,
+	IPI_COMPLETION,
 };
 
 static DECLARE_COMPLETION(cpu_running);
@@ -80,7 +83,7 @@ void __init smp_set_ops(struct smp_operations *ops)
 
 static unsigned long get_arch_pgd(pgd_t *pgd)
 {
-	phys_addr_t pgdir = virt_to_phys(pgd);
+	phys_addr_t pgdir = virt_to_idmap(pgd);
 	BUG_ON(pgdir & ARCH_PGD_MASK);
 	return pgdir >> ARCH_PGD_SHIFT;
 }
@@ -448,6 +451,14 @@ void arch_send_call_function_single_ipi(int cpu)
 	smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);
 }
 
+#ifdef CONFIG_IRQ_WORK
+void arch_irq_work_raise(void)
+{
+	if (is_smp())
+		smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK);
+}
+#endif
+
 static const char *ipi_types[NR_IPI] = {
 #define S(x,s)	[x] = s
 	S(IPI_WAKEUP, "CPU wakeup interrupts"),
@@ -456,6 +467,8 @@ static const char *ipi_types[NR_IPI] = {
 	S(IPI_CALL_FUNC, "Function call interrupts"),
 	S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"),
 	S(IPI_CPU_STOP, "CPU stop interrupts"),
+	S(IPI_IRQ_WORK, "IRQ work interrupts"),
+	S(IPI_COMPLETION, "completion interrupts"),
 };
 
 void show_ipi_list(struct seq_file *p, int prec)
@@ -515,6 +528,19 @@ static void ipi_cpu_stop(unsigned int cpu)
 		cpu_relax();
 }
 
+static DEFINE_PER_CPU(struct completion *, cpu_completion);
+
+int register_ipi_completion(struct completion *completion, int cpu)
+{
+	per_cpu(cpu_completion, cpu) = completion;
+	return IPI_COMPLETION;
+}
+
+static void ipi_complete(unsigned int cpu)
+{
+	complete(per_cpu(cpu_completion, cpu));
+}
+
 /*
  * Main handler for inter-processor interrupts
  */
@@ -565,6 +591,20 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
 		irq_exit();
 		break;
 
+#ifdef CONFIG_IRQ_WORK
+	case IPI_IRQ_WORK:
+		irq_enter();
+		irq_work_run();
+		irq_exit();
+		break;
+#endif
+
+	case IPI_COMPLETION:
+		irq_enter();
+		ipi_complete(cpu);
+		irq_exit();
+		break;
+
 	default:
 		printk(KERN_CRIT "CPU%u: Unknown IPI message 0x%x\n",
 		       cpu, ipinr);

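The IPI_COMPLETION plumbing above lets one CPU park a struct completion in a per-CPU slot via register_ipi_completion() and have another CPU complete it from IPI context, which the big.LITTLE switcher in this series appears to rely on. A loose userspace analogue of the same hand-off, with a condition variable standing in for the completion (illustration only, not the kernel mechanism):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int done;

static void *remote_cpu(void *arg)
{
	/* stands in for handle_IPI()'s IPI_COMPLETION case */
	pthread_mutex_lock(&lock);
	done = 1;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* stands in for register_ipi_completion() + waiting on the completion */
	pthread_create(&t, NULL, remote_cpu, NULL);
	pthread_mutex_lock(&lock);
	while (!done)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	puts("completion received");
	return 0;
}
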
+ 7 - 7
arch/arm/kernel/smp_scu.c

@@ -28,7 +28,7 @@
  */
 unsigned int __init scu_get_core_count(void __iomem *scu_base)
 {
-	unsigned int ncores = __raw_readl(scu_base + SCU_CONFIG);
+	unsigned int ncores = readl_relaxed(scu_base + SCU_CONFIG);
 	return (ncores & 0x03) + 1;
 }
 
@@ -42,19 +42,19 @@ void scu_enable(void __iomem *scu_base)
 #ifdef CONFIG_ARM_ERRATA_764369
 	/* Cortex-A9 only */
 	if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090) {
-		scu_ctrl = __raw_readl(scu_base + 0x30);
+		scu_ctrl = readl_relaxed(scu_base + 0x30);
 		if (!(scu_ctrl & 1))
-			__raw_writel(scu_ctrl | 0x1, scu_base + 0x30);
+			writel_relaxed(scu_ctrl | 0x1, scu_base + 0x30);
 	}
 #endif
 
-	scu_ctrl = __raw_readl(scu_base + SCU_CTRL);
+	scu_ctrl = readl_relaxed(scu_base + SCU_CTRL);
 	/* already enabled? */
 	if (scu_ctrl & 1)
 		return;
 
 	scu_ctrl |= 1;
-	__raw_writel(scu_ctrl, scu_base + SCU_CTRL);
+	writel_relaxed(scu_ctrl, scu_base + SCU_CTRL);
 
 	/*
 	 * Ensure that the data accessed by CPU0 before the SCU was
@@ -80,9 +80,9 @@ int scu_power_mode(void __iomem *scu_base, unsigned int mode)
 	if (mode > 3 || mode == 1 || cpu > 3)
 		return -EINVAL;
 
-	val = __raw_readb(scu_base + SCU_CPU_STATUS + cpu) & ~0x03;
+	val = readb_relaxed(scu_base + SCU_CPU_STATUS + cpu) & ~0x03;
 	val |= mode;
-	__raw_writeb(val, scu_base + SCU_CPU_STATUS + cpu);
+	writeb_relaxed(val, scu_base + SCU_CPU_STATUS + cpu);
 
 	return 0;
 }

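The SCU accessor switch matters for BE8: __raw_readl() returns the register bytes exactly as stored, while readl_relaxed() applies le32_to_cpu(), so only the latter yields the right CPU-order value for a little-endian peripheral register on a big-endian kernel. A small host-side model of the difference, using endian.h:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* an SCU config register holding 3 (i.e. 4 cores), stored little-endian */
	uint8_t mmio[4] = { 0x03, 0x00, 0x00, 0x00 };
	uint32_t raw, cooked;

	memcpy(&raw, mmio, sizeof(raw));	/* __raw_readl: bytes as stored  */
	cooked = le32toh(raw);			/* readl_relaxed: LE -> CPU order */

	/* on an LE host both are 3; on a BE host raw would read 0x03000000 */
	printf("raw=0x%08x relaxed=0x%08x cores=%u\n",
	       raw, cooked, (cooked & 0x03) + 1);
	return 0;
}
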
+ 34 - 2
arch/arm/kernel/smp_tlb.c

@@ -70,6 +70,40 @@ static inline void ipi_flush_bp_all(void *ignored)
 	local_flush_bp_all();
 }
 
+#ifdef CONFIG_ARM_ERRATA_798181
+bool (*erratum_a15_798181_handler)(void);
+
+static bool erratum_a15_798181_partial(void)
+{
+	asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
+	dsb(ish);
+	return false;
+}
+
+static bool erratum_a15_798181_broadcast(void)
+{
+	asm("mcr p15, 0, %0, c8, c3, 1" : : "r" (0));
+	dsb(ish);
+	return true;
+}
+
+void erratum_a15_798181_init(void)
+{
+	unsigned int midr = read_cpuid_id();
+	unsigned int revidr = read_cpuid(CPUID_REVIDR);
+
+	/* Cortex-A15 r0p0..r3p2 w/o ECO fix affected */
+	if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2 ||
+	    (revidr & 0x210) == 0x210) {
+		return;
+	}
+	if (revidr & 0x10)
+		erratum_a15_798181_handler = erratum_a15_798181_partial;
+	else
+		erratum_a15_798181_handler = erratum_a15_798181_broadcast;
+}
+#endif
+
 static void ipi_flush_tlb_a15_erratum(void *arg)
 {
 	dmb();
@@ -80,7 +114,6 @@ static void broadcast_tlb_a15_erratum(void)
 	if (!erratum_a15_798181())
 		return;
 
-	dummy_flush_tlb_a15_erratum();
 	smp_call_function(ipi_flush_tlb_a15_erratum, NULL, 1);
 }
 
@@ -92,7 +125,6 @@ static void broadcast_tlb_mm_a15_erratum(struct mm_struct *mm)
 	if (!erratum_a15_798181())
 		return;
 
-	dummy_flush_tlb_a15_erratum();
 	this_cpu = get_cpu();
 	a15_erratum_get_cpumask(this_cpu, mm, &mask);
 	smp_call_function_many(&mask, ipi_flush_tlb_a15_erratum, NULL, 1);

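erratum_a15_798181_init() above keys the workaround off MIDR (implementer/part/variant/revision) and two REVIDR bits: outside Cortex-A15 r0p0..r3p2, or with both ECO-fix bits (0x210) set, no workaround is installed; with only bit 4 set the cheaper "partial" handler is used, otherwise the broadcast handler. A standalone decode of that predicate for a few sample ID register values:

#include <stdint.h>
#include <stdio.h>

static const char *a15_798181_mode(uint32_t midr, uint32_t revidr)
{
	/* not Cortex-A15, newer than r3p2, or both ECO fixes present */
	if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2 ||
	    (revidr & 0x210) == 0x210)
		return "not affected";
	/* REVIDR bit 4: partial fix, local TLBI + DSB suffices, no IPI */
	if (revidr & 0x10)
		return "partial workaround";
	return "broadcast workaround";
}

int main(void)
{
	printf("r2p1, no fix: %s\n", a15_798181_mode(0x412fc0f1, 0x000));
	printf("r2p1, bit 4:  %s\n", a15_798181_mode(0x412fc0f1, 0x010));
	printf("r3p3:         %s\n", a15_798181_mode(0x413fc0f3, 0x000));
	return 0;
}
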
+ 12 - 12
arch/arm/kernel/smp_twd.c

@@ -45,7 +45,7 @@ static void twd_set_mode(enum clock_event_mode mode,
 	case CLOCK_EVT_MODE_PERIODIC:
 		ctrl = TWD_TIMER_CONTROL_ENABLE | TWD_TIMER_CONTROL_IT_ENABLE
 			| TWD_TIMER_CONTROL_PERIODIC;
-		__raw_writel(DIV_ROUND_CLOSEST(twd_timer_rate, HZ),
+		writel_relaxed(DIV_ROUND_CLOSEST(twd_timer_rate, HZ),
 			twd_base + TWD_TIMER_LOAD);
 		break;
 	case CLOCK_EVT_MODE_ONESHOT:
@@ -58,18 +58,18 @@ static void twd_set_mode(enum clock_event_mode mode,
 		ctrl = 0;
 	}
 
-	__raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL);
+	writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
 }
 
 static int twd_set_next_event(unsigned long evt,
 			struct clock_event_device *unused)
 {
-	unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL);
+	unsigned long ctrl = readl_relaxed(twd_base + TWD_TIMER_CONTROL);
 
 	ctrl |= TWD_TIMER_CONTROL_ENABLE;
 
-	__raw_writel(evt, twd_base + TWD_TIMER_COUNTER);
-	__raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL);
+	writel_relaxed(evt, twd_base + TWD_TIMER_COUNTER);
+	writel_relaxed(ctrl, twd_base + TWD_TIMER_CONTROL);
 
 	return 0;
 }
@@ -82,8 +82,8 @@ static int twd_set_next_event(unsigned long evt,
  */
 static int twd_timer_ack(void)
 {
-	if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) {
-		__raw_writel(1, twd_base + TWD_TIMER_INTSTAT);
+	if (readl_relaxed(twd_base + TWD_TIMER_INTSTAT)) {
+		writel_relaxed(1, twd_base + TWD_TIMER_INTSTAT);
 		return 1;
 	}
 
@@ -211,15 +211,15 @@ static void twd_calibrate_rate(void)
 		waitjiffies += 5;
 
 				 /* enable, no interrupt or reload */
-		__raw_writel(0x1, twd_base + TWD_TIMER_CONTROL);
+		writel_relaxed(0x1, twd_base + TWD_TIMER_CONTROL);
 
 				 /* maximum value */
-		__raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
+		writel_relaxed(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
 
 		while (get_jiffies_64() < waitjiffies)
 			udelay(10);
 
-		count = __raw_readl(twd_base + TWD_TIMER_COUNTER);
+		count = readl_relaxed(twd_base + TWD_TIMER_COUNTER);
 
 		twd_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5);
 
@@ -277,7 +277,7 @@ static void twd_timer_setup(void)
 	 * bother with the below.
 	 */
 	if (per_cpu(percpu_setup_called, cpu)) {
-		__raw_writel(0, twd_base + TWD_TIMER_CONTROL);
+		writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
 		clockevents_register_device(clk);
 		enable_percpu_irq(clk->irq, 0);
 		return;
@@ -290,7 +290,7 @@ static void twd_timer_setup(void)
 	 * The following is done once per CPU the first time .setup() is
 	 * called.
 	 */
-	__raw_writel(0, twd_base + TWD_TIMER_CONTROL);
+	writel_relaxed(0, twd_base + TWD_TIMER_CONTROL);
 
 	clk->name = "local_timer";
 	clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT |

+ 5 - 3
arch/arm/kernel/suspend.c

@@ -10,7 +10,7 @@
 #include <asm/suspend.h>
 #include <asm/tlbflush.h>
 
-extern int __cpu_suspend(unsigned long, int (*)(unsigned long));
+extern int __cpu_suspend(unsigned long, int (*)(unsigned long), u32 cpuid);
 extern void cpu_resume_mmu(void);
 
 #ifdef CONFIG_MMU
@@ -21,6 +21,7 @@ extern void cpu_resume_mmu(void);
 int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 {
 	struct mm_struct *mm = current->active_mm;
+	u32 __mpidr = cpu_logical_map(smp_processor_id());
 	int ret;
 
 	if (!idmap_pgd)
@@ -32,7 +33,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 	 * resume (indicated by a zero return code), we need to switch
 	 * back to the correct page tables.
 	 */
-	ret = __cpu_suspend(arg, fn);
+	ret = __cpu_suspend(arg, fn, __mpidr);
 	if (ret == 0) {
 		cpu_switch_mm(mm->pgd, mm);
 		local_flush_bp_all();
@@ -44,7 +45,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 #else
 int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 {
-	return __cpu_suspend(arg, fn);
+	u32 __mpidr = cpu_logical_map(smp_processor_id());
+	return __cpu_suspend(arg, fn, __mpidr);
 }
 #define	idmap_pgd	NULL
 #endif

+ 15 - 9
arch/arm/kernel/traps.c

@@ -34,6 +34,7 @@
 #include <asm/unwind.h>
 #include <asm/tls.h>
 #include <asm/system_misc.h>
+#include <asm/opcodes.h>
 
 static const char *handler[]= { "prefetch abort", "data abort", "address exception", "interrupt" };
 
@@ -341,15 +342,17 @@ void arm_notify_die(const char *str, struct pt_regs *regs,
 int is_valid_bugaddr(unsigned long pc)
 {
 #ifdef CONFIG_THUMB2_KERNEL
-	unsigned short bkpt;
+	u16 bkpt;
+	u16 insn = __opcode_to_mem_thumb16(BUG_INSTR_VALUE);
 #else
-	unsigned long bkpt;
+	u32 bkpt;
+	u32 insn = __opcode_to_mem_arm(BUG_INSTR_VALUE);
 #endif
 
 	if (probe_kernel_address((unsigned *)pc, bkpt))
 		return 0;
 
-	return bkpt == BUG_INSTR_VALUE;
+	return bkpt == insn;
 }
 
 #endif
@@ -402,25 +405,28 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
 	if (processor_mode(regs) == SVC_MODE) {
 #ifdef CONFIG_THUMB2_KERNEL
 		if (thumb_mode(regs)) {
-			instr = ((u16 *)pc)[0];
+			instr = __mem_to_opcode_thumb16(((u16 *)pc)[0]);
 			if (is_wide_instruction(instr)) {
-				instr <<= 16;
-				instr |= ((u16 *)pc)[1];
+				u16 inst2;
+				inst2 = __mem_to_opcode_thumb16(((u16 *)pc)[1]);
+				instr = __opcode_thumb32_compose(instr, inst2);
 			}
 		} else
 #endif
-			instr = *(u32 *) pc;
+			instr = __mem_to_opcode_arm(*(u32 *) pc);
 	} else if (thumb_mode(regs)) {
 		if (get_user(instr, (u16 __user *)pc))
 			goto die_sig;
+		instr = __mem_to_opcode_thumb16(instr);
 		if (is_wide_instruction(instr)) {
 			unsigned int instr2;
 			if (get_user(instr2, (u16 __user *)pc+1))
 				goto die_sig;
-			instr <<= 16;
-			instr |= instr2;
+			instr2 = __mem_to_opcode_thumb16(instr2);
+			instr = __opcode_thumb32_compose(instr, instr2);
 		}
 	} else if (get_user(instr, (u32 __user *)pc)) {
+		instr = __mem_to_opcode_arm(instr);
 		goto die_sig;
 	}
 

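The undefined-instruction path now reads each Thumb halfword through __mem_to_opcode_thumb16() and composes the 32-bit opcode with __opcode_thumb32_compose(), so the BE8 instruction/data endianness split is handled by the helpers rather than open-coded shifts. The composition itself is just this (standalone sketch; the kernel helpers additionally byte-swap each halfword on BE8 builds):

#include <stdint.h>
#include <stdio.h>

/* first halfwords >= 0xe800 (top bits 11101/11110/11111) start a 32-bit insn */
static int is_wide_instruction(uint16_t first)
{
	return first >= 0xe800;
}

static uint32_t thumb32_compose(uint16_t first, uint16_t second)
{
	return ((uint32_t)first << 16) | second;
}

int main(void)
{
	/* a Thumb-2 BL encoding split across two halfwords in memory */
	uint16_t hw[2] = { 0xf7ff, 0xfffe };

	if (is_wide_instruction(hw[0]))
		printf("0x%08x\n", thumb32_compose(hw[0], hw[1]));
	return 0;
}
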
+ 3 - 3
arch/arm/kvm/arm.c

@@ -65,7 +65,7 @@ static bool vgic_present;
 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 {
 	BUG_ON(preemptible());
-	__get_cpu_var(kvm_arm_running_vcpu) = vcpu;
+	__this_cpu_write(kvm_arm_running_vcpu, vcpu);
 }
 
 /**
@@ -75,7 +75,7 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arm_get_running_vcpu(void)
 {
 	BUG_ON(preemptible());
-	return __get_cpu_var(kvm_arm_running_vcpu);
+	return __this_cpu_read(kvm_arm_running_vcpu);
 }
 
 /**
@@ -815,7 +815,7 @@ static void cpu_init_hyp_mode(void *dummy)
 
 	boot_pgd_ptr = kvm_mmu_get_boot_httbr();
 	pgd_ptr = kvm_mmu_get_httbr();
-	stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
+	stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
 	hyp_stack_ptr = stack_page + PAGE_SIZE;
 	vector_ptr = (unsigned long)__kvm_hyp_vector;
 

+ 5 - 0
arch/arm/lib/bitops.h

@@ -10,6 +10,11 @@ UNWIND(	.fnstart	)
 	and	r3, r0, #31		@ Get bit offset
 	mov	r0, r0, lsr #5
 	add	r1, r1, r0, lsl #2	@ Get word offset
+#if __LINUX_ARM_ARCH__ >= 7
+	.arch_extension	mp
+	ALT_SMP(W(pldw)	[r1])
+	ALT_UP(W(nop))
+#endif
 	mov	r3, r2, lsl r3
 1:	ldrex	r2, [r1]
 	\instr	r2, r2, r3

+ 38 - 3
arch/arm/lib/uaccess_with_memcpy.c

@@ -18,6 +18,7 @@
 #include <linux/hardirq.h> /* for in_atomic() */
 #include <linux/gfp.h>
 #include <linux/highmem.h>
+#include <linux/hugetlb.h>
 #include <asm/current.h>
 #include <asm/page.h>
 
@@ -40,7 +41,35 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
 		return 0;
 
 	pmd = pmd_offset(pud, addr);
-	if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
+	if (unlikely(pmd_none(*pmd)))
+		return 0;
+
+	/*
+	 * A pmd can be bad if it refers to a HugeTLB or THP page.
+	 *
+	 * Both THP and HugeTLB pages have the same pmd layout
+	 * and should not be manipulated by the pte functions.
+	 *
+	 * Lock the page table for the destination and check
+	 * to see that it's still huge and whether or not we will
+	 * need to fault on write, or if we have a splitting THP.
+	 */
+	if (unlikely(pmd_thp_or_huge(*pmd))) {
+		ptl = &current->mm->page_table_lock;
+		spin_lock(ptl);
+		if (unlikely(!pmd_thp_or_huge(*pmd)
+			|| pmd_hugewillfault(*pmd)
+			|| pmd_trans_splitting(*pmd))) {
+			spin_unlock(ptl);
+			return 0;
+		}
+
+		*ptep = NULL;
+		*ptlp = ptl;
+		return 1;
+	}
+
+	if (unlikely(pmd_bad(*pmd)))
 		return 0;
 
 	pte = pte_offset_map_lock(current->mm, pmd, addr, &ptl);
@@ -94,7 +123,10 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
 		from += tocopy;
 		n -= tocopy;
 
-		pte_unmap_unlock(pte, ptl);
+		if (pte)
+			pte_unmap_unlock(pte, ptl);
+		else
+			spin_unlock(ptl);
 	}
 	if (!atomic)
 		up_read(&current->mm->mmap_sem);
@@ -147,7 +179,10 @@ __clear_user_memset(void __user *addr, unsigned long n)
 		addr += tocopy;
 		n -= tocopy;
 
-		pte_unmap_unlock(pte, ptl);
+		if (pte)
+			pte_unmap_unlock(pte, ptl);
+		else
+			spin_unlock(ptl);
 	}
 	up_read(&current->mm->mmap_sem);
 

+ 4 - 4
arch/arm/mach-footbridge/netwinder-hw.c

@@ -692,14 +692,14 @@ static void netwinder_led_set(struct led_classdev *cdev,
 	unsigned long flags;
 	u32 reg;
 
-	spin_lock_irqsave(&nw_gpio_lock, flags);
+	raw_spin_lock_irqsave(&nw_gpio_lock, flags);
 	reg = nw_gpio_read();
 	if (b != LED_OFF)
 		reg &= ~led->mask;
 	else
 		reg |= led->mask;
 	nw_gpio_modify_op(led->mask, reg);
-	spin_unlock_irqrestore(&nw_gpio_lock, flags);
+	raw_spin_unlock_irqrestore(&nw_gpio_lock, flags);
 }
 
 static enum led_brightness netwinder_led_get(struct led_classdev *cdev)
@@ -709,9 +709,9 @@ static enum led_brightness netwinder_led_get(struct led_classdev *cdev)
 	unsigned long flags;
 	u32 reg;
 
-	spin_lock_irqsave(&nw_gpio_lock, flags);
+	raw_spin_lock_irqsave(&nw_gpio_lock, flags);
 	reg = nw_gpio_read();
-	spin_unlock_irqrestore(&nw_gpio_lock, flags);
+	raw_spin_unlock_irqrestore(&nw_gpio_lock, flags);
 
 	return (reg & led->mask) ? LED_OFF : LED_FULL;
 }

+ 2 - 1
arch/arm/mach-highbank/Kconfig

@@ -4,11 +4,12 @@ config ARCH_HIGHBANK
 	select ARCH_HAS_CPUFREQ
 	select ARCH_HAS_HOLES_MEMORYMODEL
 	select ARCH_HAS_OPP
+	select ARCH_SUPPORTS_BIG_ENDIAN
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select ARM_AMBA
 	select ARM_ERRATA_764369
 	select ARM_ERRATA_775420
-	select ARM_ERRATA_798181
+	select ARM_ERRATA_798181 if SMP
 	select ARM_GIC
 	select ARM_PSCI
 	select ARM_TIMER_SP804

+ 0 - 4
arch/arm/mach-ixp4xx/Kconfig

@@ -1,9 +1,5 @@
 if ARCH_IXP4XX
 
-config ARCH_SUPPORTS_BIG_ENDIAN
-	bool
-	default y
-
 menu "Intel IXP4xx Implementation Options"
 
 comment "IXP4xx Platforms"

+ 1 - 0
arch/arm/mach-mvebu/Kconfig

@@ -1,5 +1,6 @@
 config ARCH_MVEBU
 	bool "Marvell SOCs with Device Tree support" if ARCH_MULTI_V7
+	select ARCH_SUPPORTS_BIG_ENDIAN
 	select CLKSRC_MMIO
 	select COMMON_CLK
 	select GENERIC_CLOCKEVENTS

+ 3 - 0
arch/arm/mach-mvebu/coherency_ll.S

@@ -20,6 +20,8 @@
 #define ARMADA_XP_CFB_CTL_REG_OFFSET 0x0
 #define ARMADA_XP_CFB_CFG_REG_OFFSET 0x4
 
+#include <asm/assembler.h>
+
 	.text
 /*
  * r0: Coherency fabric base register address
@@ -29,6 +31,7 @@ ENTRY(ll_set_cpu_coherent)
 	/* Create bit by cpu index */
 	mov	r3, #(1 << 24)
 	lsl	r1, r3, r1
+ARM_BE8(rev	r1, r1)
 
 	/* Add CPU to SMP group - Atomic */
 	add	r3, r0, #ARMADA_XP_CFB_CTL_REG_OFFSET

+ 4 - 0
arch/arm/mach-mvebu/headsmp.S

@@ -21,12 +21,16 @@
 #include <linux/linkage.h>
 #include <linux/init.h>
 
+#include <asm/assembler.h>
+
 /*
  * Armada XP specific entry point for secondary CPUs.
  * We add the CPU to the coherency fabric and then jump to secondary
  * startup
  */
 ENTRY(armada_xp_secondary_startup)
+ ARM_BE8(setend	be )			@ go BE8 if entered LE
+
 	/* Get coherency fabric base physical address */
 	adr	r0, 1f
 	ldr	r1, [r0]

+ 3 - 0
arch/arm/mach-sa1100/assabet.c

@@ -512,6 +512,9 @@ static void __init assabet_map_io(void)
 	 * Its called GPCLKR0 in my SA1110 manual.
 	 */
 	Ser1SDCR0 |= SDCR0_SUS;
+	MSC1 = (MSC1 & ~0xffff) |
+		MSC_NonBrst | MSC_32BitStMem |
+		MSC_RdAcc(2) | MSC_WrAcc(2) | MSC_Rec(0);
 
 	if (!machine_has_neponset())
 		sa1100_register_uart_fns(&assabet_port_fns);

+ 0 - 55
arch/arm/mach-sa1100/include/mach/gpio.h

@@ -1,55 +0,0 @@
-/*
- * arch/arm/mach-sa1100/include/mach/gpio.h
- *
- * SA1100 GPIO wrappers for arch-neutral GPIO calls
- *
- * Written by Philipp Zabel <philipp.zabel@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- */
-
-#ifndef __ASM_ARCH_SA1100_GPIO_H
-#define __ASM_ARCH_SA1100_GPIO_H
-
-#include <linux/io.h>
-#include <mach/hardware.h>
-#include <asm/irq.h>
-#include <asm-generic/gpio.h>
-
-#define __ARM_GPIOLIB_COMPLEX
-
-static inline int gpio_get_value(unsigned gpio)
-{
-	if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-		return GPLR & GPIO_GPIO(gpio);
-	else
-		return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned gpio, int value)
-{
-	if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-		if (value)
-			GPSR = GPIO_GPIO(gpio);
-		else
-			GPCR = GPIO_GPIO(gpio);
-	else
-		__gpio_set_value(gpio, value);
-}
-
-#define gpio_cansleep	__gpio_cansleep
-
-#endif

+ 2 - 0
arch/arm/mach-sa1100/include/mach/h3xxx.h

@@ -13,6 +13,8 @@
 #ifndef _INCLUDE_H3XXX_H_
 #define _INCLUDE_H3XXX_H_
 
+#include "hardware.h" /* Gives GPIO_MAX */
+
 /* Physical memory regions corresponding to chip selects */
 #define H3600_EGPIO_PHYS	(SA1100_CS5_PHYS + 0x01000000)
 #define H3600_BANK_2_PHYS	SA1100_CS2_PHYS

+ 1 - 0
arch/arm/mach-sa1100/simpad.c

@@ -19,6 +19,7 @@
 
 #include <mach/hardware.h>
 #include <asm/setup.h>
+#include <asm/irq.h>
 
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>

+ 1 - 1
arch/arm/mach-tegra/Kconfig

@@ -51,7 +51,7 @@ config ARCH_TEGRA_3x_SOC
 
 config ARCH_TEGRA_114_SOC
 	bool "Enable support for Tegra114 family"
-	select ARM_ERRATA_798181
+	select ARM_ERRATA_798181 if SMP
 	select ARM_L1_CACHE_SHIFT_6
 	select HAVE_ARM_ARCH_TIMER
 	select PINCTRL_TEGRA114

+ 1 - 0
arch/arm/mach-vexpress/Kconfig

@@ -1,6 +1,7 @@
 config ARCH_VEXPRESS
 	bool "ARM Ltd. Versatile Express family" if ARCH_MULTI_V7
 	select ARCH_REQUIRE_GPIOLIB
+	select ARCH_SUPPORTS_BIG_ENDIAN
 	select ARM_AMBA
 	select ARM_GIC
 	select ARM_TIMER_SP804

+ 4 - 52
arch/arm/mach-vexpress/dcscb.c

@@ -133,38 +133,8 @@ static void dcscb_power_down(void)
 	if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
 		arch_spin_unlock(&dcscb_lock);
 
-		/*
-		 * Flush all cache levels for this cluster.
-		 *
-		 * To do so we do:
-		 * - Clear the SCTLR.C bit to prevent further cache allocations
-		 * - Flush the whole cache
-		 * - Clear the ACTLR "SMP" bit to disable local coherency
-		 *
-		 * Let's do it in the safest possible way i.e. with
-		 * no memory access within the following sequence
-		 * including to the stack.
-		 *
-		 * Note: fp is preserved to the stack explicitly prior doing
-		 * this since adding it to the clobber list is incompatible
-		 * with having CONFIG_FRAME_POINTER=y.
-		 */
-		asm volatile(
-		"str	fp, [sp, #-4]! \n\t"
-		"mrc	p15, 0, r0, c1, c0, 0	@ get CR \n\t"
-		"bic	r0, r0, #"__stringify(CR_C)" \n\t"
-		"mcr	p15, 0, r0, c1, c0, 0	@ set CR \n\t"
-		"isb	\n\t"
-		"bl	v7_flush_dcache_all \n\t"
-		"clrex	\n\t"
-		"mrc	p15, 0, r0, c1, c0, 1	@ get AUXCR \n\t"
-		"bic	r0, r0, #(1 << 6)	@ disable local coherency \n\t"
-		"mcr	p15, 0, r0, c1, c0, 1	@ set AUXCR \n\t"
-		"isb	\n\t"
-		"dsb	\n\t"
-		"ldr	fp, [sp], #4"
-		: : : "r0","r1","r2","r3","r4","r5","r6","r7",
-		      "r9","r10","lr","memory");
+		/* Flush all cache levels for this cluster. */
+		v7_exit_coherency_flush(all);
 
 		/*
 		 * This is a harmless no-op.  On platforms with a real
@@ -183,26 +153,8 @@ static void dcscb_power_down(void)
 	} else {
 		arch_spin_unlock(&dcscb_lock);
 
-		/*
-		 * Flush the local CPU cache.
-		 * Let's do it in the safest possible way as above.
-		 */
-		asm volatile(
-		"str	fp, [sp, #-4]! \n\t"
-		"mrc	p15, 0, r0, c1, c0, 0	@ get CR \n\t"
-		"bic	r0, r0, #"__stringify(CR_C)" \n\t"
-		"mcr	p15, 0, r0, c1, c0, 0	@ set CR \n\t"
-		"isb	\n\t"
-		"bl	v7_flush_dcache_louis \n\t"
-		"clrex	\n\t"
-		"mrc	p15, 0, r0, c1, c0, 1	@ get AUXCR \n\t"
-		"bic	r0, r0, #(1 << 6)	@ disable local coherency \n\t"
-		"mcr	p15, 0, r0, c1, c0, 1	@ set AUXCR \n\t"
-		"isb	\n\t"
-		"dsb	\n\t"
-		"ldr	fp, [sp], #4"
-		: : : "r0","r1","r2","r3","r4","r5","r6","r7",
-		      "r9","r10","lr","memory");
+		/* Disable and flush the local CPU cache. */
+		v7_exit_coherency_flush(louis);
 	}
 
 	__mcpm_cpu_down(cpu, cluster);

+ 2 - 46
arch/arm/mach-vexpress/tc2_pm.c

@@ -156,32 +156,7 @@ static void tc2_pm_down(u64 residency)
 			: : "r" (0x400) );
 		}
 
-		/*
-		 * We need to disable and flush the whole (L1 and L2) cache.
-		 * Let's do it in the safest possible way i.e. with
-		 * no memory access within the following sequence
-		 * including the stack.
-		 *
-		 * Note: fp is preserved to the stack explicitly prior doing
-		 * this since adding it to the clobber list is incompatible
-		 * with having CONFIG_FRAME_POINTER=y.
-		 */
-		asm volatile(
-		"str	fp, [sp, #-4]! \n\t"
-		"mrc	p15, 0, r0, c1, c0, 0	@ get CR \n\t"
-		"bic	r0, r0, #"__stringify(CR_C)" \n\t"
-		"mcr	p15, 0, r0, c1, c0, 0	@ set CR \n\t"
-		"isb	\n\t"
-		"bl	v7_flush_dcache_all \n\t"
-		"clrex	\n\t"
-		"mrc	p15, 0, r0, c1, c0, 1	@ get AUXCR \n\t"
-		"bic	r0, r0, #(1 << 6)	@ disable local coherency \n\t"
-		"mcr	p15, 0, r0, c1, c0, 1	@ set AUXCR \n\t"
-		"isb	\n\t"
-		"dsb	\n\t"
-		"ldr	fp, [sp], #4"
-		: : : "r0","r1","r2","r3","r4","r5","r6","r7",
-		      "r9","r10","lr","memory");
+		v7_exit_coherency_flush(all);
 
 		cci_disable_port_by_cpu(mpidr);
 
@@ -197,26 +172,7 @@ static void tc2_pm_down(u64 residency)
 
 		arch_spin_unlock(&tc2_pm_lock);
 
-		/*
-		 * We need to disable and flush only the L1 cache.
-		 * Let's do it in the safest possible way as above.
-		 */
-		asm volatile(
-		"str	fp, [sp, #-4]! \n\t"
-		"mrc	p15, 0, r0, c1, c0, 0	@ get CR \n\t"
-		"bic	r0, r0, #"__stringify(CR_C)" \n\t"
-		"mcr	p15, 0, r0, c1, c0, 0	@ set CR \n\t"
-		"isb	\n\t"
-		"bl	v7_flush_dcache_louis \n\t"
-		"clrex	\n\t"
-		"mrc	p15, 0, r0, c1, c0, 1	@ get AUXCR \n\t"
-		"bic	r0, r0, #(1 << 6)	@ disable local coherency \n\t"
-		"mcr	p15, 0, r0, c1, c0, 1	@ set AUXCR \n\t"
-		"isb	\n\t"
-		"dsb	\n\t"
-		"ldr	fp, [sp], #4"
-		: : : "r0","r1","r2","r3","r4","r5","r6","r7",
-		      "r9","r10","lr","memory");
+		v7_exit_coherency_flush(louis);
 	}
 
 	__mcpm_cpu_down(cpu, cluster);

+ 6 - 0
arch/arm/mm/Kconfig

@@ -952,3 +952,9 @@ config ARCH_HAS_BARRIERS
 	help
 	  This option allows the use of custom mandatory barriers
 	  included via the mach/barriers.h file.
+
+config ARCH_SUPPORTS_BIG_ENDIAN
+	bool
+	help
+	  This option specifies the architecture can support big endian
+	  operation.

+ 2 - 3
arch/arm/mm/abort-ev6.S

@@ -38,9 +38,8 @@ ENTRY(v6_early_abort)
 	bne	do_DataAbort
 	bic	r1, r1, #1 << 11		@ clear bit 11 of FSR
 	ldr	r3, [r4]			@ read aborted ARM instruction
-#ifdef CONFIG_CPU_ENDIAN_BE8
-	rev	r3, r3
-#endif
+ ARM_BE8(rev	r3, r3)
+
 	do_ldrd_abort tmp=ip, insn=r3
 	tst	r3, #1 << 20			@ L = 0 -> write
 	orreq	r1, r1, #1 << 11		@ yes.

+ 7 - 2
arch/arm/mm/alignment.c

@@ -25,6 +25,7 @@
 #include <asm/cp15.h>
 #include <asm/system_info.h>
 #include <asm/unaligned.h>
+#include <asm/opcodes.h>
 
 #include "fault.h"
 
@@ -762,21 +763,25 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	if (thumb_mode(regs)) {
 		u16 *ptr = (u16 *)(instrptr & ~1);
 		fault = probe_kernel_address(ptr, tinstr);
+		tinstr = __mem_to_opcode_thumb16(tinstr);
 		if (!fault) {
 			if (cpu_architecture() >= CPU_ARCH_ARMv7 &&
 			    IS_T32(tinstr)) {
 				/* Thumb-2 32-bit */
 				u16 tinst2 = 0;
 				fault = probe_kernel_address(ptr + 1, tinst2);
-				instr = (tinstr << 16) | tinst2;
+				tinst2 = __mem_to_opcode_thumb16(tinst2);
+				instr = __opcode_thumb32_compose(tinstr, tinst2);
 				thumb2_32b = 1;
 			} else {
 				isize = 2;
 				instr = thumb2arm(tinstr);
 			}
 		}
-	} else
+	} else {
 		fault = probe_kernel_address(instrptr, instr);
+		instr = __mem_to_opcode_arm(instr);
+	}
 
 	if (fault) {
 		type = TYPE_FAULT;

+ 2 - 2
arch/arm/mm/dma-mapping.c

@@ -707,7 +707,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 		    gfp_t gfp, struct dma_attrs *attrs)
 {
-	pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel);
+	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
 	void *memory;
 
 	if (dma_alloc_from_coherent(dev, size, handle, &memory))
@@ -720,7 +720,7 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
 	dma_addr_t *handle, gfp_t gfp, struct dma_attrs *attrs)
 {
-	pgprot_t prot = __get_dma_pgprot(attrs, pgprot_kernel);
+	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
 	void *memory;
 
 	if (dma_alloc_from_coherent(dev, size, handle, &memory))

+ 6 - 1
arch/arm/mm/extable.c

@@ -9,8 +9,13 @@ int fixup_exception(struct pt_regs *regs)
 	const struct exception_table_entry *fixup;
 
 	fixup = search_exception_tables(instruction_pointer(regs));
-	if (fixup)
+	if (fixup) {
 		regs->ARM_pc = fixup->fixup;
+#ifdef CONFIG_THUMB2_KERNEL
+		/* Clear the IT state to avoid nasty surprises in the fixup */
+		regs->ARM_cpsr &= ~PSR_IT_MASK;
+#endif
+	}
 
 	return fixup != NULL;
 }

+ 4 - 4
arch/arm/mm/idmap.c

@@ -10,6 +10,7 @@
 #include <asm/system_info.h>
 
 pgd_t *idmap_pgd;
+phys_addr_t (*arch_virt_to_idmap) (unsigned long x);
 
 #ifdef CONFIG_ARM_LPAE
 static void idmap_add_pmd(pud_t *pud, unsigned long addr, unsigned long end,
@@ -67,8 +68,9 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start,
 	unsigned long addr, end;
 	unsigned long next;
 
-	addr = virt_to_phys(text_start);
-	end = virt_to_phys(text_end);
+	addr = virt_to_idmap(text_start);
+	end = virt_to_idmap(text_end);
+	pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end);
 
 	prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
 
@@ -90,8 +92,6 @@ static int __init init_static_idmap(void)
 	if (!idmap_pgd)
 		return -ENOMEM;
 
-	pr_info("Setting up static identity map for 0x%p - 0x%p\n",
-		__idmap_text_start, __idmap_text_end);
 	identity_mapping_add(idmap_pgd, __idmap_text_start,
 			     __idmap_text_end, 0);
 

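arch_virt_to_idmap above is a function pointer a platform can install so that the static identity map (and, via the smp.c hunk earlier, the ARCH_PGD value handed to secondary CPUs) is computed from an alternate physical alias rather than the ordinary linear virt-to-phys translation; when it is unset, virt_to_idmap() presumably falls back to virt_to_phys(). A reduced standalone sketch of the pattern, with made-up addresses (RAM really above 4GB, with a low 32-bit alias):

#include <stdint.h>
#include <stdio.h>

static uint64_t (*arch_virt_to_idmap)(unsigned long x);

static uint64_t model_virt_to_phys(unsigned long x)
{
	return (uint64_t)(x - 0xc0000000ul) + 0x800000000ull;	/* sample offset */
}

static uint64_t virt_to_idmap(unsigned long x)
{
	if (arch_virt_to_idmap)
		return arch_virt_to_idmap(x);	/* platform override */
	return model_virt_to_phys(x);		/* default: linear map */
}

static uint64_t low_alias_idmap(unsigned long x)
{
	/* a below-4GB alias of the same RAM, usable for the identity map */
	return (uint64_t)(x - 0xc0000000ul) + 0x80000000ull;
}

int main(void)
{
	unsigned long va = 0xc0008000ul;

	printf("default:  0x%llx\n", (unsigned long long)virt_to_idmap(va));
	arch_virt_to_idmap = low_alias_idmap;
	printf("override: 0x%llx\n", (unsigned long long)virt_to_idmap(va));
	return 0;
}
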
+ 2 - 4
arch/arm/mm/mmap.c

@@ -202,13 +202,11 @@ int valid_phys_addr_range(phys_addr_t addr, size_t size)
 }
 
 /*
- * We don't use supersection mappings for mmap() on /dev/mem, which
- * means that we can't map the memory area above the 4G barrier into
- * userspace.
+ * Do not allow /dev/mem mappings beyond the supported physical range.
  */
 int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
 {
-	return !(pfn + (size >> PAGE_SHIFT) > 0x00100000);
+	return (pfn + (size >> PAGE_SHIFT)) <= (1 + (PHYS_MASK >> PAGE_SHIFT));
 }
 
 #ifdef CONFIG_STRICT_DEVMEM

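The new /dev/mem bound is derived from PHYS_MASK rather than a hard-wired 4GB limit, so LPAE kernels can map high physical memory while non-LPAE builds keep an equivalent cap. A standalone version of the check, assuming a 40-bit LPAE physical address space and 4KB pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PHYS_MASK	((1ULL << 40) - 1)	/* LPAE: 40-bit phys addresses */

static int valid_mmap_phys_addr_range(uint64_t pfn, uint64_t size)
{
	return (pfn + (size >> PAGE_SHIFT)) <= (1 + (PHYS_MASK >> PAGE_SHIFT));
}

int main(void)
{
	/* a 1MB window at 6GB: rejected by the old 4GB rule, fine under LPAE */
	uint64_t pfn = (6ULL << 30) >> PAGE_SHIFT;

	printf("old 4GB rule:   %d\n", (pfn + (1 << 8)) <= 0x00100000);
	printf("PHYS_MASK rule: %d\n", valid_mmap_phys_addr_range(pfn, 1 << 20));
	return 0;
}
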
+ 82 - 0
arch/arm/mm/mmu.c

@@ -28,6 +28,8 @@
 #include <asm/highmem.h>
 #include <asm/system_info.h>
 #include <asm/traps.h>
+#include <asm/procinfo.h>
+#include <asm/memory.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
@@ -1315,6 +1317,86 @@ static void __init map_lowmem(void)
 	}
 }
 
+#ifdef CONFIG_ARM_LPAE
+/*
+ * early_paging_init() recreates boot time page table setup, allowing machines
+ * to switch over to a high (>4G) address space on LPAE systems
+ */
+void __init early_paging_init(const struct machine_desc *mdesc,
+			      struct proc_info_list *procinfo)
+{
+	pmdval_t pmdprot = procinfo->__cpu_mm_mmu_flags;
+	unsigned long map_start, map_end;
+	pgd_t *pgd0, *pgdk;
+	pud_t *pud0, *pudk, *pud_start;
+	pmd_t *pmd0, *pmdk;
+	phys_addr_t phys;
+	int i;
+
+	if (!(mdesc->init_meminfo))
+		return;
+
+	/* remap kernel code and data */
+	map_start = init_mm.start_code;
+	map_end   = init_mm.brk;
+
+	/* get a handle on things... */
+	pgd0 = pgd_offset_k(0);
+	pud_start = pud0 = pud_offset(pgd0, 0);
+	pmd0 = pmd_offset(pud0, 0);
+
+	pgdk = pgd_offset_k(map_start);
+	pudk = pud_offset(pgdk, map_start);
+	pmdk = pmd_offset(pudk, map_start);
+
+	mdesc->init_meminfo();
+
+	/* Run the patch stub to update the constants */
+	fixup_pv_table(&__pv_table_begin,
+		(&__pv_table_end - &__pv_table_begin) << 2);
+
+	/*
+	 * Cache cleaning operations for self-modifying code
+	 * We should clean the entries by MVA but running a
+	 * for loop over every pv_table entry pointer would
+	 * just complicate the code.
+	 */
+	flush_cache_louis();
+	dsb();
+	isb();
+
+	/* remap level 1 table */
+	for (i = 0; i < PTRS_PER_PGD; pud0++, i++) {
+		set_pud(pud0,
+			__pud(__pa(pmd0) | PMD_TYPE_TABLE | L_PGD_SWAPPER));
+		pmd0 += PTRS_PER_PMD;
+	}
+
+	/* remap pmds for kernel mapping */
+	phys = __pa(map_start) & PMD_MASK;
+	do {
+		*pmdk++ = __pmd(phys | pmdprot);
+		phys += PMD_SIZE;
+	} while (phys < map_end);
+
+	flush_cache_all();
+	cpu_switch_mm(pgd0, &init_mm);
+	cpu_set_ttbr(1, __pa(pgd0) + TTBR1_OFFSET);
+	local_flush_bp_all();
+	local_flush_tlb_all();
+}
+
+#else
+
+void __init early_paging_init(const struct machine_desc *mdesc,
+			      struct proc_info_list *procinfo)
+{
+	if (mdesc->init_meminfo)
+		mdesc->init_meminfo();
+}
+
+#endif
+
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps, and sets up the zero page, bad page and bad page tables.

+ 9 - 0
arch/arm/mm/nommu.c

@@ -295,6 +295,15 @@ void __init sanity_check_meminfo(void)
 	high_memory = __va(end - 1) + 1;
 }
 
+/*
+ * early_paging_init() recreates boot time page table setup, allowing machines
+ * to switch over to a high (>4G) address space on LPAE systems
+ */
+void __init early_paging_init(const struct machine_desc *mdesc,
+			      struct proc_info_list *procinfo)
+{
+}
+
 /*
  * paging_init() sets up the page tables, initialises the zone memory
  * maps, and sets up the zero page, bad page and bad page tables.

+ 1 - 3
arch/arm/mm/proc-v6.S

@@ -220,9 +220,7 @@ __v6_setup:
 #endif /* CONFIG_MMU */
 	adr	r5, v6_crval
 	ldmia	r5, {r5, r6}
-#ifdef CONFIG_CPU_ENDIAN_BE8
-	orr	r6, r6, #1 << 25		@ big-endian page tables
-#endif
+ ARM_BE8(orr	r6, r6, #1 << 25)		@ big-endian page tables
 	mrc	p15, 0, r0, c1, c0, 0		@ read control register
 	bic	r0, r0, r5			@ clear bits them
 	orr	r0, r0, r6			@ set them

+ 1 - 3
arch/arm/mm/proc-v7.S

@@ -367,9 +367,7 @@ __v7_setup:
 #endif
 	adr	r5, v7_crval
 	ldmia	r5, {r5, r6}
-#ifdef CONFIG_CPU_ENDIAN_BE8
-	orr	r6, r6, #1 << 25		@ big-endian page tables
-#endif
+ ARM_BE8(orr	r6, r6, #1 << 25)		@ big-endian page tables
 #ifdef CONFIG_SWP_EMULATE
 	orr     r5, r5, #(1 << 10)              @ set SW bit in "clear"
 	bic     r6, r6, #(1 << 10)              @ clear it in "mmuset"

+ 5 - 1
arch/arm/net/bpf_jit_32.c

@@ -19,6 +19,7 @@
 #include <linux/if_vlan.h>
 #include <asm/cacheflush.h>
 #include <asm/hwcap.h>
+#include <asm/opcodes.h>
 
 #include "bpf_jit_32.h"
 
@@ -113,8 +114,11 @@ static u32 jit_udiv(u32 dividend, u32 divisor)
 
 static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
 {
+	inst |= (cond << 28);
+	inst = __opcode_to_mem_arm(inst);
+
 	if (ctx->target != NULL)
-		ctx->target[ctx->idx] = inst | (cond << 28);
+		ctx->target[ctx->idx] = inst;
 
 	ctx->idx++;
 }

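_emit() now folds the condition code into the instruction word and converts it to memory byte order before storing, so a BE8 kernel's BPF JIT emits instructions the CPU will actually fetch correctly. The arithmetic, standalone (the byte swap stands in for __opcode_to_mem_arm() on a BE8 build):

#include <stdint.h>
#include <stdio.h>

#define ARM_COND_AL	0xeu
#define ARM_COND_EQ	0x0u

static uint32_t emit_word(uint32_t cond, uint32_t inst, int be8)
{
	inst |= cond << 28;			/* condition field, bits 31:28 */
	return be8 ? __builtin_bswap32(inst) : inst;
}

int main(void)
{
	uint32_t add = 0x00810002;		/* add r0, r1, r2 with cond field clear */

	printf("LE:  %08x\n", emit_word(ARM_COND_AL, add, 0));
	printf("BE8: %08x\n", emit_word(ARM_COND_EQ, add, 1));
	return 0;
}
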
+ 2 - 0
arch/arm/plat-versatile/headsmp.S

@@ -10,6 +10,7 @@
  */
 #include <linux/linkage.h>
 #include <linux/init.h>
+#include <asm/assembler.h>
 
 /*
  * Realview/Versatile Express specific entry point for secondary CPUs.
@@ -17,6 +18,7 @@
  * until we're ready for them to initialise.
  */
 ENTRY(versatile_secondary_startup)
+ ARM_BE8(setend	be)
 	mrc	p15, 0, r0, c0, c0, 5
 	bic	r0, #0xff000000
 	adr	r4, 1f

+ 3 - 3
arch/arm/vfp/vfpmodule.c

@@ -642,9 +642,9 @@ int vfp_restore_user_hwstate(struct user_vfp __user *ufp,
 static int vfp_hotplug(struct notifier_block *b, unsigned long action,
 	void *hcpu)
 {
-	if (action == CPU_DYING || action == CPU_DYING_FROZEN) {
-		vfp_force_reload((long)hcpu, current_thread_info());
-	} else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN)
+	if (action == CPU_DYING || action == CPU_DYING_FROZEN)
+		vfp_current_hw_state[(long)hcpu] = NULL;
+	else if (action == CPU_STARTING || action == CPU_STARTING_FROZEN)
 		vfp_enable(NULL);
 	return NOTIFY_OK;
 }

Some files were not shown because too many files changed in this diff